diff mbox series

libstdc++: Speed up push_back

Message ID ZVqD4QG/X2nj6GrP@kam.mff.cuni.cz
State New
Headers show
Series libstdc++: Speed up push_back | expand

Commit Message

Jan Hubicka Nov. 19, 2023, 9:53 p.m. UTC
Hi,
this patch speeds up the push_back at -O3 significantly by making the
reallocation to be inlined by default.  _M_realloc_insert is general
insertion that takes iterator pointing to location where the value
should be inserted.  As such it contains code to move other entries around
that is quite large.

Since appending to the end of array is common operation, I think we should
have specialized code for that.  Sadly it is really hard to work out this
from IPA passes, since we basically care whether the iterator points to
the same place as the end pointer, which are both passed by reference.
This is inter-procedural value numbering that is quite out of reach.

I also added extra check making it clear that the new length of the vector
is non-zero.  This saves extra conditionals.  Again it is quite hard case
since _M_check_len seem to be able to return 0 if its parameter is 0.
This never happens here, but we are not able to propagate this early nor
at IPA stage.

Would it be OK to duplciate code as this?  The resulting code is still not quite
optimal.

Regtested on x86_64-linux, OK?

Honza

void std::vector<pair_t>::_M_realloc_append<const pair_t&> (struct vector * const this, const struct pair_t & __args#0)
{
  struct _Guard __guard;
  struct pair_t * __new_finish;
  struct pair_t * __old_finish;
  struct pair_t * __old_start;
  struct _Vector_impl * _1;
  long unsigned int _2;
  struct pair_t * _3;
  struct pair_t * _4;
  long int _5;
  long int _6;
  long unsigned int _7;
  long unsigned int _8;
  struct pair_t * _9;
  const size_type _13;
  struct pair_t * _16;
  struct _Vector_impl * _18;
  long int _27;
  long unsigned int _34;

  <bb 2> [local count: 1073741824]:
  _13 = std::vector<pair_t>::_M_check_len (this_11(D), 1, "vector::_M_realloc_append");
  if (_13 == 0)
    goto <bb 3>; [0.00%]
  else
    goto <bb 4>; [100.00%]

  <bb 3> [count: 0]:
  __builtin_unreachable ();

  <bb 4> [local count: 1073741824]:
  __old_start_14 = this_11(D)->D.26060._M_impl.D.25361._M_start;
  __old_finish_15 = this_11(D)->D.26060._M_impl.D.25361._M_finish;
  _27 = __old_finish_15 - __old_start_14;
  _18 = &MEM[(struct _Vector_base *)this_11(D)]._M_impl;
  _16 = std::__new_allocator<pair_t>::allocate (_18, _13, 0B);
  _1 = &this_11(D)->D.26060._M_impl;
  __guard ={v} {CLOBBER};
  __guard._M_alloc = _1;
  _2 = (long unsigned int) _27;
  _3 = _16 + _2;
  *_3 = *__args#0_17(D);
  if (_27 > 0)
    goto <bb 5>; [41.48%]
  else
    goto <bb 6>; [58.52%]

  <bb 5> [local count: 445388112]:
  __builtin_memmove (_16, __old_start_14, _2);

  <bb 6> [local count: 1073741824]:
  _34 = _2 + 8;
  __new_finish_19 = _16 + _34;
  __guard._M_storage = __old_start_14;
  _4 = this_11(D)->D.26060._M_impl.D.25361._M_end_of_storage;
  _5 = _4 - __old_start_14;
  _6 = _5 /[ex] 8;
  _7 = (long unsigned int) _6;
  __guard._M_len = _7;
  std::vector<pair_t>::_M_realloc_append<const pair_t&>(const pair_t&)::_Guard::~_Guard (&__guard);
  __guard ={v} {CLOBBER(eol)};
  this_11(D)->D.26060._M_impl.D.25361._M_start = _16;
  this_11(D)->D.26060._M_impl.D.25361._M_finish = __new_finish_19;
  _8 = _13 * 8;
  _9 = _16 + _8;
  this_11(D)->D.26060._M_impl.D.25361._M_end_of_storage = _9;
  return;

}

Notice that memmove can be memcopy and the test whether block size is non-zero is useless.


libstdc++-v3/ChangeLog:

	PR libstdc++/110287
	* include/bits/stl_vector.h (_M_realloc_append): New member function.
	(push_back): Use it.
	* include/bits/vector.tcc: (emplace_back): Use it.
	(_M_realloc_insert): Let compiler know that new vector size is non-zero.
	(_M_realloc_append): New member function.

Comments

Jonathan Wakely Nov. 20, 2023, 12:09 p.m. UTC | #1
On Sun, 19 Nov 2023 at 21:53, Jan Hubicka <hubicka@ucw.cz> wrote:
>
> Hi,
> this patch speeds up the push_back at -O3 significantly by making the
> reallocation to be inlined by default.  _M_realloc_insert is general
> insertion that takes iterator pointing to location where the value
> should be inserted.  As such it contains code to move other entries around
> that is quite large.
>
> Since appending to the end of array is common operation, I think we should
> have specialized code for that.  Sadly it is really hard to work out this
> from IPA passes, since we basically care whether the iterator points to
> the same place as the end pointer, which are both passed by reference.
> This is inter-procedural value numbering that is quite out of reach.
>
> I also added extra check making it clear that the new length of the vector
> is non-zero.  This saves extra conditionals.  Again it is quite hard case
> since _M_check_len seem to be able to return 0 if its parameter is 0.
> This never happens here, but we are not able to propagate this early nor
> at IPA stage.
>
> Would it be OK to duplciate code as this?  The resulting code is still not quite
> optimal.
>
> Regtested on x86_64-linux, OK?
>
> Honza
>
> void std::vector<pair_t>::_M_realloc_append<const pair_t&> (struct vector * const this, const struct pair_t & __args#0)
> {
>   struct _Guard __guard;
>   struct pair_t * __new_finish;
>   struct pair_t * __old_finish;
>   struct pair_t * __old_start;
>   struct _Vector_impl * _1;
>   long unsigned int _2;
>   struct pair_t * _3;
>   struct pair_t * _4;
>   long int _5;
>   long int _6;
>   long unsigned int _7;
>   long unsigned int _8;
>   struct pair_t * _9;
>   const size_type _13;
>   struct pair_t * _16;
>   struct _Vector_impl * _18;
>   long int _27;
>   long unsigned int _34;
>
>   <bb 2> [local count: 1073741824]:
>   _13 = std::vector<pair_t>::_M_check_len (this_11(D), 1, "vector::_M_realloc_append");
>   if (_13 == 0)
>     goto <bb 3>; [0.00%]
>   else
>     goto <bb 4>; [100.00%]
>
>   <bb 3> [count: 0]:
>   __builtin_unreachable ();
>
>   <bb 4> [local count: 1073741824]:
>   __old_start_14 = this_11(D)->D.26060._M_impl.D.25361._M_start;
>   __old_finish_15 = this_11(D)->D.26060._M_impl.D.25361._M_finish;
>   _27 = __old_finish_15 - __old_start_14;
>   _18 = &MEM[(struct _Vector_base *)this_11(D)]._M_impl;
>   _16 = std::__new_allocator<pair_t>::allocate (_18, _13, 0B);
>   _1 = &this_11(D)->D.26060._M_impl;
>   __guard ={v} {CLOBBER};
>   __guard._M_alloc = _1;
>   _2 = (long unsigned int) _27;
>   _3 = _16 + _2;
>   *_3 = *__args#0_17(D);
>   if (_27 > 0)
>     goto <bb 5>; [41.48%]
>   else
>     goto <bb 6>; [58.52%]
>
>   <bb 5> [local count: 445388112]:
>   __builtin_memmove (_16, __old_start_14, _2);
>
>   <bb 6> [local count: 1073741824]:
>   _34 = _2 + 8;
>   __new_finish_19 = _16 + _34;
>   __guard._M_storage = __old_start_14;
>   _4 = this_11(D)->D.26060._M_impl.D.25361._M_end_of_storage;
>   _5 = _4 - __old_start_14;
>   _6 = _5 /[ex] 8;
>   _7 = (long unsigned int) _6;
>   __guard._M_len = _7;
>   std::vector<pair_t>::_M_realloc_append<const pair_t&>(const pair_t&)::_Guard::~_Guard (&__guard);
>   __guard ={v} {CLOBBER(eol)};
>   this_11(D)->D.26060._M_impl.D.25361._M_start = _16;
>   this_11(D)->D.26060._M_impl.D.25361._M_finish = __new_finish_19;
>   _8 = _13 * 8;
>   _9 = _16 + _8;
>   this_11(D)->D.26060._M_impl.D.25361._M_end_of_storage = _9;
>   return;
>
> }
>
> Notice that memmove can be memcopy and the test whether block size is non-zero is useless.
>
>
> libstdc++-v3/ChangeLog:
>
>         PR libstdc++/110287
>         * include/bits/stl_vector.h (_M_realloc_append): New member function.
>         (push_back): Use it.
>         * include/bits/vector.tcc: (emplace_back): Use it.
>         (_M_realloc_insert): Let compiler know that new vector size is non-zero.
>         (_M_realloc_append): New member function.
>
> diff --git a/libstdc++-v3/include/bits/stl_vector.h b/libstdc++-v3/include/bits/stl_vector.h
> index 5e18f6eedce..973f4d7e2e9 100644
> --- a/libstdc++-v3/include/bits/stl_vector.h
> +++ b/libstdc++-v3/include/bits/stl_vector.h
> @@ -1288,7 +1288,7 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
>             _GLIBCXX_ASAN_ANNOTATE_GREW(1);
>           }
>         else
> -         _M_realloc_insert(end(), __x);
> +         _M_realloc_append(__x);
>        }
>
>  #if __cplusplus >= 201103L
> @@ -1822,6 +1822,9 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
>
>        void
>        _M_realloc_insert(iterator __position, const value_type& __x);
> +
> +      void
> +      _M_realloc_append(const value_type& __x);
>  #else
>        // A value_type object constructed with _Alloc_traits::construct()
>        // and destroyed with _Alloc_traits::destroy().
> @@ -1871,6 +1874,11 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
>         void
>         _M_realloc_insert(iterator __position, _Args&&... __args);
>
> +      template<typename... _Args>
> +       _GLIBCXX20_CONSTEXPR
> +       void
> +       _M_realloc_append(_Args&&... __args);
> +
>        // Either move-construct at the end, or forward to _M_insert_aux.
>        _GLIBCXX20_CONSTEXPR
>        iterator
> diff --git a/libstdc++-v3/include/bits/vector.tcc b/libstdc++-v3/include/bits/vector.tcc
> index 80631d1e2a1..1306676e795 100644
> --- a/libstdc++-v3/include/bits/vector.tcc
> +++ b/libstdc++-v3/include/bits/vector.tcc
> @@ -120,7 +120,7 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
>             _GLIBCXX_ASAN_ANNOTATE_GREW(1);
>           }
>         else
> -         _M_realloc_insert(end(), std::forward<_Args>(__args)...);
> +         _M_realloc_append(std::forward<_Args>(__args)...);
>  #if __cplusplus > 201402L
>         return back();
>  #endif
> @@ -459,6 +459,8 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
>  #endif
>      {
>        const size_type __len = _M_check_len(1u, "vector::_M_realloc_insert");
> +      if (__len <= 0)
> +       __builtin_unreachable ();
>        pointer __old_start = this->_M_impl._M_start;
>        pointer __old_finish = this->_M_impl._M_finish;
>        const size_type __elems_before = __position - begin();
> @@ -571,6 +573,129 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
>        this->_M_impl._M_end_of_storage = __new_start + __len;
>      }
>
> +#if __cplusplus >= 201103L
> +  template<typename _Tp, typename _Alloc>
> +    template<typename... _Args>
> +      _GLIBCXX20_CONSTEXPR
> +      void
> +      vector<_Tp, _Alloc>::
> +      _M_realloc_append(_Args&&... __args)
> +#else
> +  template<typename _Tp, typename _Alloc>
> +    void
> +    vector<_Tp, _Alloc>::
> +    _M_realloc_append(const _Tp& __x)
> +#endif
> +    {
> +      const size_type __len = _M_check_len(1u, "vector::_M_realloc_append");
> +      if (__len <= 0)
> +       __builtin_unreachable ();
> +      pointer __old_start = this->_M_impl._M_start;
> +      pointer __old_finish = this->_M_impl._M_finish;
> +      const size_type __elems = end() - begin();
> +      pointer __new_start(this->_M_allocate(__len));
> +      pointer __new_finish(__new_start);
> +
> +      // RAII guard for allocated storage.
> +      struct _Guard
> +      {
> +       pointer _M_storage;         // Storage to deallocate
> +       size_type _M_len;
> +       _Tp_alloc_type& _M_alloc;
> +
> +       _GLIBCXX20_CONSTEXPR
> +       _Guard(pointer __s, size_type __l, _Tp_alloc_type& __a)
> +       : _M_storage(__s), _M_len(__l), _M_alloc(__a)
> +       { }
> +
> +       _GLIBCXX20_CONSTEXPR
> +       ~_Guard()
> +       {
> +         if (_M_storage)
> +           __gnu_cxx::__alloc_traits<_Tp_alloc_type>::
> +             deallocate(_M_alloc, _M_storage, _M_len);
> +       }
> +
> +      private:
> +       _Guard(const _Guard&);
> +      };
> +
> +      {
> +       _Guard __guard(__new_start, __len, _M_impl);
> +
> +       // The order of the three operations is dictated by the C++11
> +       // case, where the moves could alter a new element belonging
> +       // to the existing vector.  This is an issue only for callers
> +       // taking the element by lvalue ref (see last bullet of C++11
> +       // [res.on.arguments]).
> +
> +       // If this throws, the existing elements are unchanged.
> +#if __cplusplus >= 201103L
> +       _Alloc_traits::construct(this->_M_impl,
> +                                std::__to_address(__new_start + __elems),
> +                                std::forward<_Args>(__args)...);
> +#else
> +       _Alloc_traits::construct(this->_M_impl,
> +                                __new_start + __elems,
> +                                __x);
> +#endif
> +
> +#if __cplusplus >= 201103L
> +       if _GLIBCXX17_CONSTEXPR (_S_use_relocate())
> +         {
> +           // Relocation cannot throw.
> +           __new_finish = _S_relocate(__old_start, __old_finish,
> +                                      __new_start, _M_get_Tp_allocator());
> +           ++__new_finish;
> +         }
> +       else
> +#endif
> +         {
> +           // RAII type to destroy initialized elements.

There's only one initialized element, not "elements".

> +           struct _Guard_elts
> +           {
> +             pointer _M_first, _M_last;  // Elements to destroy

We only need to store one pointer here, call it _M_p.

> +             _Tp_alloc_type& _M_alloc;
> +
> +             _GLIBCXX20_CONSTEXPR
> +             _Guard_elts(pointer __elt, _Tp_alloc_type& __a)
> +             : _M_first(__elt), _M_last(__elt + 1), _M_alloc(__a)
> +             { }
> +
> +             _GLIBCXX20_CONSTEXPR
> +             ~_Guard_elts()
> +             { std::_Destroy(_M_first, _M_last, _M_alloc); }

This should be either:

  std::_Destroy(_M_p, _M_p+1, _M_alloc);

or avoid the loop that happens in that _Destroy function:

  _Alloc_traits::destroy(_M_alloc, _M_p);

> +
> +           private:
> +             _Guard_elts(const _Guard_elts&);
> +           };
> +
> +           // Guard the new element so it will be destroyed if anything throws.
> +           _Guard_elts __guard_elts(__new_start + __elems, _M_impl);
> +
> +           __new_finish = std::__uninitialized_move_if_noexcept_a(
> +                            __old_start, __old_finish,
> +                            __new_start, _M_get_Tp_allocator());
> +
> +           ++__new_finish;
> +           // Guard everything before the new element too.
> +           __guard_elts._M_first = __new_start;

This seems redundant, we're not doing any more insertions now, and so
this store is dead.

> +
> +           // New storage has been fully initialized, destroy the old elements.
> +           __guard_elts._M_first = __old_start;
> +           __guard_elts._M_last = __old_finish;
> +         }
> +       __guard._M_storage = __old_start;
> +       __guard._M_len = this->_M_impl._M_end_of_storage - __old_start;
> +      }
> +      // deallocate should be called before assignments to _M_impl,
> +      // to avoid call-clobbering
> +
> +      this->_M_impl._M_start = __new_start;
> +      this->_M_impl._M_finish = __new_finish;
> +      this->_M_impl._M_end_of_storage = __new_start + __len;
> +    }
> +
>    template<typename _Tp, typename _Alloc>
>      _GLIBCXX20_CONSTEXPR
>      void
>
Jan Hubicka Nov. 20, 2023, 3:44 p.m. UTC | #2
> > +           // RAII type to destroy initialized elements.
> 
> There's only one initialized element, not "elements".
> 
> > +           struct _Guard_elts
> > +           {
> > +             pointer _M_first, _M_last;  // Elements to destroy
> 
> We only need to store one pointer here, call it _M_p.
> 
> > +             _Tp_alloc_type& _M_alloc;
> > +
> > +             _GLIBCXX20_CONSTEXPR
> > +             _Guard_elts(pointer __elt, _Tp_alloc_type& __a)
> > +             : _M_first(__elt), _M_last(__elt + 1), _M_alloc(__a)
> > +             { }
> > +
> > +             _GLIBCXX20_CONSTEXPR
> > +             ~_Guard_elts()
> > +             { std::_Destroy(_M_first, _M_last, _M_alloc); }
> 
> This should be either:
> 
>   std::_Destroy(_M_p, _M_p+1, _M_alloc);
> 
> or avoid the loop that happens in that _Destroy function:
> 
>   _Alloc_traits::destroy(_M_alloc, _M_p);
> 
> > +
> > +           private:
> > +             _Guard_elts(const _Guard_elts&);
> > +           };
> > +
> > +           // Guard the new element so it will be destroyed if anything throws.
> > +           _Guard_elts __guard_elts(__new_start + __elems, _M_impl);
> > +
> > +           __new_finish = std::__uninitialized_move_if_noexcept_a(
> > +                            __old_start, __old_finish,
> > +                            __new_start, _M_get_Tp_allocator());
> > +
> > +           ++__new_finish;
> > +           // Guard everything before the new element too.
> > +           __guard_elts._M_first = __new_start;
> 
> This seems redundant, we're not doing any more insertions now, and so
> this store is dead.

I removed this one.
> 
> > +
> > +           // New storage has been fully initialized, destroy the old elements.
> > +           __guard_elts._M_first = __old_start;
> > +           __guard_elts._M_last = __old_finish;

However here I think I need __guard_elts supporting destruction of many
elements in case the vector has moved to new location....
So I do not quite  see how to simplify the code as suggested above
except that the constructor can be simplified to not require first and
last argument since we always initialize it for 1 destruction but later
we may update it.

Thanks,
Honza
Jonathan Wakely Nov. 20, 2023, 4:46 p.m. UTC | #3
On Mon, 20 Nov 2023 at 15:44, Jan Hubicka <hubicka@ucw.cz> wrote:
>
> > > +           // RAII type to destroy initialized elements.
> >
> > There's only one initialized element, not "elements".
> >
> > > +           struct _Guard_elts
> > > +           {
> > > +             pointer _M_first, _M_last;  // Elements to destroy
> >
> > We only need to store one pointer here, call it _M_p.
> >
> > > +             _Tp_alloc_type& _M_alloc;
> > > +
> > > +             _GLIBCXX20_CONSTEXPR
> > > +             _Guard_elts(pointer __elt, _Tp_alloc_type& __a)
> > > +             : _M_first(__elt), _M_last(__elt + 1), _M_alloc(__a)
> > > +             { }
> > > +
> > > +             _GLIBCXX20_CONSTEXPR
> > > +             ~_Guard_elts()
> > > +             { std::_Destroy(_M_first, _M_last, _M_alloc); }
> >
> > This should be either:
> >
> >   std::_Destroy(_M_p, _M_p+1, _M_alloc);
> >
> > or avoid the loop that happens in that _Destroy function:
> >
> >   _Alloc_traits::destroy(_M_alloc, _M_p);
> >
> > > +
> > > +           private:
> > > +             _Guard_elts(const _Guard_elts&);
> > > +           };
> > > +
> > > +           // Guard the new element so it will be destroyed if anything throws.
> > > +           _Guard_elts __guard_elts(__new_start + __elems, _M_impl);
> > > +
> > > +           __new_finish = std::__uninitialized_move_if_noexcept_a(
> > > +                            __old_start, __old_finish,
> > > +                            __new_start, _M_get_Tp_allocator());
> > > +
> > > +           ++__new_finish;
> > > +           // Guard everything before the new element too.
> > > +           __guard_elts._M_first = __new_start;
> >
> > This seems redundant, we're not doing any more insertions now, and so
> > this store is dead.
>
> I removed this one.
> >
> > > +
> > > +           // New storage has been fully initialized, destroy the old elements.
> > > +           __guard_elts._M_first = __old_start;
> > > +           __guard_elts._M_last = __old_finish;
>
> However here I think I need __guard_elts supporting destruction of many
> elements in case the vector has moved to new location....

Ah yes.

> So I do not quite  see how to simplify the code as suggested above
> except that the constructor can be simplified to not require first and
> last argument since we always initialize it for 1 destruction but later
> we may update it.

We could just destroy the old ones manually at the end of this block,
it doesn't need to be done by the guard, e.g. update the guard to
destroy the first one, then loop to destroy the others:

__guard_elt._M_p = __old_start++;
std::_Destroy(__old_start, __old_finish, this->_M_impl);

But this would only improve codegen in the exceptional case when
something throws, and the guard destructor destroys a single element.
For the common case where we reach the end of the block, we're always
going to loop. So I agree with leaving __guard_elts in charge of two
pointers, and looping in its dtor.

So OK for trunk, with the dead store removed. Thanks!
Jan Hubicka Nov. 21, 2023, 12:50 p.m. UTC | #4
> > +           // RAII type to destroy initialized elements.
> 
> There's only one initialized element, not "elements".
> 
> > +           struct _Guard_elts
> > +           {
> > +             pointer _M_first, _M_last;  // Elements to destroy
> 
> We only need to store one pointer here, call it _M_p.
> 
> > +             _Tp_alloc_type& _M_alloc;
> > +
> > +             _GLIBCXX20_CONSTEXPR
> > +             _Guard_elts(pointer __elt, _Tp_alloc_type& __a)
> > +             : _M_first(__elt), _M_last(__elt + 1), _M_alloc(__a)
> > +             { }
> > +
> > +             _GLIBCXX20_CONSTEXPR
> > +             ~_Guard_elts()
> > +             { std::_Destroy(_M_first, _M_last, _M_alloc); }
> 
> This should be either:
> 
>   std::_Destroy(_M_p, _M_p+1, _M_alloc);
> 
> or avoid the loop that happens in that _Destroy function:
> 
>   _Alloc_traits::destroy(_M_alloc, _M_p);
> 
> > +
> > +           private:
> > +             _Guard_elts(const _Guard_elts&);
> > +           };
> > +
> > +           // Guard the new element so it will be destroyed if anything throws.
> > +           _Guard_elts __guard_elts(__new_start + __elems, _M_impl);
> > +
> > +           __new_finish = std::__uninitialized_move_if_noexcept_a(
> > +                            __old_start, __old_finish,
> > +                            __new_start, _M_get_Tp_allocator());
> > +
> > +           ++__new_finish;
> > +           // Guard everything before the new element too.
> > +           __guard_elts._M_first = __new_start;
> 
> This seems redundant, we're not doing any more insertions now, and so
> this store is dead.

I am attaching patch with this check removed.  However I still think
Guard_elts needs to stay to be able to destroy the old ellements
> 
> > +
> > +           // New storage has been fully initialized, destroy the old elements.
> > +           __guard_elts._M_first = __old_start;
> > +           __guard_elts._M_last = __old_finish;
... here

Does it look better? (I am not really that confidend with libstdc++).

I also was wondering if we do have some data on what libstdc++ functions
are perofrmance critical in practice.  Given that the push_back can be
sped up very noticeably, I wonder if we don't have problems elsewhere?

Thank you,
Honza

diff --git a/libstdc++-v3/include/bits/stl_vector.h b/libstdc++-v3/include/bits/stl_vector.h
index 5e18f6eedce..973f4d7e2e9 100644
--- a/libstdc++-v3/include/bits/stl_vector.h
+++ b/libstdc++-v3/include/bits/stl_vector.h
@@ -1288,7 +1288,7 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
 	    _GLIBCXX_ASAN_ANNOTATE_GREW(1);
 	  }
 	else
-	  _M_realloc_insert(end(), __x);
+	  _M_realloc_append(__x);
       }
 
 #if __cplusplus >= 201103L
@@ -1822,6 +1822,9 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
 
       void
       _M_realloc_insert(iterator __position, const value_type& __x);
+
+      void
+      _M_realloc_append(const value_type& __x);
 #else
       // A value_type object constructed with _Alloc_traits::construct()
       // and destroyed with _Alloc_traits::destroy().
@@ -1871,6 +1874,11 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
 	void
 	_M_realloc_insert(iterator __position, _Args&&... __args);
 
+      template<typename... _Args>
+	_GLIBCXX20_CONSTEXPR
+	void
+	_M_realloc_append(_Args&&... __args);
+
       // Either move-construct at the end, or forward to _M_insert_aux.
       _GLIBCXX20_CONSTEXPR
       iterator
diff --git a/libstdc++-v3/include/bits/vector.tcc b/libstdc++-v3/include/bits/vector.tcc
index 80631d1e2a1..0ccef7911b3 100644
--- a/libstdc++-v3/include/bits/vector.tcc
+++ b/libstdc++-v3/include/bits/vector.tcc
@@ -120,7 +120,7 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
 	    _GLIBCXX_ASAN_ANNOTATE_GREW(1);
 	  }
 	else
-	  _M_realloc_insert(end(), std::forward<_Args>(__args)...);
+	  _M_realloc_append(std::forward<_Args>(__args)...);
 #if __cplusplus > 201402L
 	return back();
 #endif
@@ -459,6 +459,8 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
 #endif
     {
       const size_type __len = _M_check_len(1u, "vector::_M_realloc_insert");
+      if (__len <= 0)
+	__builtin_unreachable ();
       pointer __old_start = this->_M_impl._M_start;
       pointer __old_finish = this->_M_impl._M_finish;
       const size_type __elems_before = __position - begin();
@@ -571,6 +573,127 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
       this->_M_impl._M_end_of_storage = __new_start + __len;
     }
 
+#if __cplusplus >= 201103L
+  template<typename _Tp, typename _Alloc>
+    template<typename... _Args>
+      _GLIBCXX20_CONSTEXPR
+      void
+      vector<_Tp, _Alloc>::
+      _M_realloc_append(_Args&&... __args)
+#else
+  template<typename _Tp, typename _Alloc>
+    void
+    vector<_Tp, _Alloc>::
+    _M_realloc_append(const _Tp& __x)
+#endif
+    {
+      const size_type __len = _M_check_len(1u, "vector::_M_realloc_append");
+      if (__len <= 0)
+	__builtin_unreachable ();
+      pointer __old_start = this->_M_impl._M_start;
+      pointer __old_finish = this->_M_impl._M_finish;
+      const size_type __elems = end() - begin();
+      pointer __new_start(this->_M_allocate(__len));
+      pointer __new_finish(__new_start);
+
+      // RAII guard for allocated storage.
+      struct _Guard
+      {
+	pointer _M_storage;	    // Storage to deallocate
+	size_type _M_len;
+	_Tp_alloc_type& _M_alloc;
+
+	_GLIBCXX20_CONSTEXPR
+	_Guard(pointer __s, size_type __l, _Tp_alloc_type& __a)
+	: _M_storage(__s), _M_len(__l), _M_alloc(__a)
+	{ }
+
+	_GLIBCXX20_CONSTEXPR
+	~_Guard()
+	{
+	  if (_M_storage)
+	    __gnu_cxx::__alloc_traits<_Tp_alloc_type>::
+	      deallocate(_M_alloc, _M_storage, _M_len);
+	}
+
+      private:
+	_Guard(const _Guard&);
+      };
+
+      {
+	_Guard __guard(__new_start, __len, _M_impl);
+
+	// The order of the three operations is dictated by the C++11
+	// case, where the moves could alter a new element belonging
+	// to the existing vector.  This is an issue only for callers
+	// taking the element by lvalue ref (see last bullet of C++11
+	// [res.on.arguments]).
+
+	// If this throws, the existing elements are unchanged.
+#if __cplusplus >= 201103L
+	_Alloc_traits::construct(this->_M_impl,
+				 std::__to_address(__new_start + __elems),
+				 std::forward<_Args>(__args)...);
+#else
+	_Alloc_traits::construct(this->_M_impl,
+				 __new_start + __elems,
+				 __x);
+#endif
+
+#if __cplusplus >= 201103L
+	if _GLIBCXX17_CONSTEXPR (_S_use_relocate())
+	  {
+	    // Relocation cannot throw.
+	    __new_finish = _S_relocate(__old_start, __old_finish,
+				       __new_start, _M_get_Tp_allocator());
+	    ++__new_finish;
+	  }
+	else
+#endif
+	  {
+	    // RAII type to destroy initialized elements.
+	    struct _Guard_elts
+	    {
+	      pointer _M_first, _M_last;  // Elements to destroy
+	      _Tp_alloc_type& _M_alloc;
+
+	      _GLIBCXX20_CONSTEXPR
+	      _Guard_elts(pointer __elt, _Tp_alloc_type& __a)
+	      : _M_first(__elt), _M_last(__elt + 1), _M_alloc(__a)
+	      { }
+
+	      _GLIBCXX20_CONSTEXPR
+	      ~_Guard_elts()
+	      { std::_Destroy(_M_first, _M_last, _M_alloc); }
+
+	    private:
+	      _Guard_elts(const _Guard_elts&);
+	    };
+
+	    // Guard the new element so it will be destroyed if anything throws.
+	    _Guard_elts __guard_elts(__new_start + __elems, _M_impl);
+
+	    __new_finish = std::__uninitialized_move_if_noexcept_a(
+			     __old_start, __old_finish,
+			     __new_start, _M_get_Tp_allocator());
+
+	    ++__new_finish;
+
+	    // New storage has been fully initialized, destroy the old elements.
+	    __guard_elts._M_first = __old_start;
+	    __guard_elts._M_last = __old_finish;
+	  }
+	__guard._M_storage = __old_start;
+	__guard._M_len = this->_M_impl._M_end_of_storage - __old_start;
+      }
+      // deallocate should be called before assignments to _M_impl,
+      // to avoid call-clobbering
+
+      this->_M_impl._M_start = __new_start;
+      this->_M_impl._M_finish = __new_finish;
+      this->_M_impl._M_end_of_storage = __new_start + __len;
+    }
+
   template<typename _Tp, typename _Alloc>
     _GLIBCXX20_CONSTEXPR
     void
Jonathan Wakely Nov. 21, 2023, 1:07 p.m. UTC | #5
On Tue, 21 Nov 2023 at 12:50, Jan Hubicka <hubicka@ucw.cz> wrote:
>
> > > +           // RAII type to destroy initialized elements.
> >
> > There's only one initialized element, not "elements".
> >
> > > +           struct _Guard_elts
> > > +           {
> > > +             pointer _M_first, _M_last;  // Elements to destroy
> >
> > We only need to store one pointer here, call it _M_p.
> >
> > > +             _Tp_alloc_type& _M_alloc;
> > > +
> > > +             _GLIBCXX20_CONSTEXPR
> > > +             _Guard_elts(pointer __elt, _Tp_alloc_type& __a)
> > > +             : _M_first(__elt), _M_last(__elt + 1), _M_alloc(__a)
> > > +             { }
> > > +
> > > +             _GLIBCXX20_CONSTEXPR
> > > +             ~_Guard_elts()
> > > +             { std::_Destroy(_M_first, _M_last, _M_alloc); }
> >
> > This should be either:
> >
> >   std::_Destroy(_M_p, _M_p+1, _M_alloc);
> >
> > or avoid the loop that happens in that _Destroy function:
> >
> >   _Alloc_traits::destroy(_M_alloc, _M_p);
> >
> > > +
> > > +           private:
> > > +             _Guard_elts(const _Guard_elts&);
> > > +           };
> > > +
> > > +           // Guard the new element so it will be destroyed if anything throws.
> > > +           _Guard_elts __guard_elts(__new_start + __elems, _M_impl);
> > > +
> > > +           __new_finish = std::__uninitialized_move_if_noexcept_a(
> > > +                            __old_start, __old_finish,
> > > +                            __new_start, _M_get_Tp_allocator());
> > > +
> > > +           ++__new_finish;
> > > +           // Guard everything before the new element too.
> > > +           __guard_elts._M_first = __new_start;
> >
> > This seems redundant, we're not doing any more insertions now, and so
> > this store is dead.
>
> I am attaching patch with this check removed.  However I still think
> Guard_elts needs to stay to be able to destroy the old ellements
> >
> > > +
> > > +           // New storage has been fully initialized, destroy the old elements.
> > > +           __guard_elts._M_first = __old_start;
> > > +           __guard_elts._M_last = __old_finish;
> ... here
>
> Does it look better? (I am not really that confidend with libstdc++).

Yes, looks good, thanks.


>
> I also was wondering if we do have some data on what libstdc++ functions
> are perofrmance critical in practice.  Given that the push_back can be
> sped up very noticeably, I wonder if we don't have problems elsewhere?

We don't have that data, no.

It's possible that we could do similar things for insert(iterator pos,
...) for the case where pos==end(), i.e. inserting multiple elements
at the end.
>
> Thank you,
> Honza
>
> diff --git a/libstdc++-v3/include/bits/stl_vector.h b/libstdc++-v3/include/bits/stl_vector.h
> index 5e18f6eedce..973f4d7e2e9 100644
> --- a/libstdc++-v3/include/bits/stl_vector.h
> +++ b/libstdc++-v3/include/bits/stl_vector.h
> @@ -1288,7 +1288,7 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
>             _GLIBCXX_ASAN_ANNOTATE_GREW(1);
>           }
>         else
> -         _M_realloc_insert(end(), __x);
> +         _M_realloc_append(__x);
>        }
>
>  #if __cplusplus >= 201103L
> @@ -1822,6 +1822,9 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
>
>        void
>        _M_realloc_insert(iterator __position, const value_type& __x);
> +
> +      void
> +      _M_realloc_append(const value_type& __x);
>  #else
>        // A value_type object constructed with _Alloc_traits::construct()
>        // and destroyed with _Alloc_traits::destroy().
> @@ -1871,6 +1874,11 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
>         void
>         _M_realloc_insert(iterator __position, _Args&&... __args);
>
> +      template<typename... _Args>
> +       _GLIBCXX20_CONSTEXPR
> +       void
> +       _M_realloc_append(_Args&&... __args);
> +
>        // Either move-construct at the end, or forward to _M_insert_aux.
>        _GLIBCXX20_CONSTEXPR
>        iterator
> diff --git a/libstdc++-v3/include/bits/vector.tcc b/libstdc++-v3/include/bits/vector.tcc
> index 80631d1e2a1..0ccef7911b3 100644
> --- a/libstdc++-v3/include/bits/vector.tcc
> +++ b/libstdc++-v3/include/bits/vector.tcc
> @@ -120,7 +120,7 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
>             _GLIBCXX_ASAN_ANNOTATE_GREW(1);
>           }
>         else
> -         _M_realloc_insert(end(), std::forward<_Args>(__args)...);
> +         _M_realloc_append(std::forward<_Args>(__args)...);
>  #if __cplusplus > 201402L
>         return back();
>  #endif
> @@ -459,6 +459,8 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
>  #endif
>      {
>        const size_type __len = _M_check_len(1u, "vector::_M_realloc_insert");
> +      if (__len <= 0)
> +       __builtin_unreachable ();
>        pointer __old_start = this->_M_impl._M_start;
>        pointer __old_finish = this->_M_impl._M_finish;
>        const size_type __elems_before = __position - begin();
> @@ -571,6 +573,127 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
>        this->_M_impl._M_end_of_storage = __new_start + __len;
>      }
>
> +#if __cplusplus >= 201103L
> +  template<typename _Tp, typename _Alloc>
> +    template<typename... _Args>
> +      _GLIBCXX20_CONSTEXPR
> +      void
> +      vector<_Tp, _Alloc>::
> +      _M_realloc_append(_Args&&... __args)
> +#else
> +  template<typename _Tp, typename _Alloc>
> +    void
> +    vector<_Tp, _Alloc>::
> +    _M_realloc_append(const _Tp& __x)
> +#endif
> +    {
> +      const size_type __len = _M_check_len(1u, "vector::_M_realloc_append");
> +      if (__len <= 0)
> +       __builtin_unreachable ();
> +      pointer __old_start = this->_M_impl._M_start;
> +      pointer __old_finish = this->_M_impl._M_finish;
> +      const size_type __elems = end() - begin();
> +      pointer __new_start(this->_M_allocate(__len));
> +      pointer __new_finish(__new_start);
> +
> +      // RAII guard for allocated storage.
> +      struct _Guard
> +      {
> +       pointer _M_storage;         // Storage to deallocate
> +       size_type _M_len;
> +       _Tp_alloc_type& _M_alloc;
> +
> +       _GLIBCXX20_CONSTEXPR
> +       _Guard(pointer __s, size_type __l, _Tp_alloc_type& __a)
> +       : _M_storage(__s), _M_len(__l), _M_alloc(__a)
> +       { }
> +
> +       _GLIBCXX20_CONSTEXPR
> +       ~_Guard()
> +       {
> +         if (_M_storage)
> +           __gnu_cxx::__alloc_traits<_Tp_alloc_type>::
> +             deallocate(_M_alloc, _M_storage, _M_len);
> +       }
> +
> +      private:
> +       _Guard(const _Guard&);
> +      };
> +
> +      {
> +       _Guard __guard(__new_start, __len, _M_impl);
> +
> +       // The order of the three operations is dictated by the C++11
> +       // case, where the moves could alter a new element belonging
> +       // to the existing vector.  This is an issue only for callers
> +       // taking the element by lvalue ref (see last bullet of C++11
> +       // [res.on.arguments]).
> +
> +       // If this throws, the existing elements are unchanged.
> +#if __cplusplus >= 201103L
> +       _Alloc_traits::construct(this->_M_impl,
> +                                std::__to_address(__new_start + __elems),
> +                                std::forward<_Args>(__args)...);
> +#else
> +       _Alloc_traits::construct(this->_M_impl,
> +                                __new_start + __elems,
> +                                __x);
> +#endif
> +
> +#if __cplusplus >= 201103L
> +       if _GLIBCXX17_CONSTEXPR (_S_use_relocate())
> +         {
> +           // Relocation cannot throw.
> +           __new_finish = _S_relocate(__old_start, __old_finish,
> +                                      __new_start, _M_get_Tp_allocator());
> +           ++__new_finish;
> +         }
> +       else
> +#endif
> +         {
> +           // RAII type to destroy initialized elements.
> +           struct _Guard_elts
> +           {
> +             pointer _M_first, _M_last;  // Elements to destroy
> +             _Tp_alloc_type& _M_alloc;
> +
> +             _GLIBCXX20_CONSTEXPR
> +             _Guard_elts(pointer __elt, _Tp_alloc_type& __a)
> +             : _M_first(__elt), _M_last(__elt + 1), _M_alloc(__a)
> +             { }
> +
> +             _GLIBCXX20_CONSTEXPR
> +             ~_Guard_elts()
> +             { std::_Destroy(_M_first, _M_last, _M_alloc); }
> +
> +           private:
> +             _Guard_elts(const _Guard_elts&);
> +           };
> +
> +           // Guard the new element so it will be destroyed if anything throws.
> +           _Guard_elts __guard_elts(__new_start + __elems, _M_impl);
> +
> +           __new_finish = std::__uninitialized_move_if_noexcept_a(
> +                            __old_start, __old_finish,
> +                            __new_start, _M_get_Tp_allocator());
> +
> +           ++__new_finish;
> +
> +           // New storage has been fully initialized, destroy the old elements.
> +           __guard_elts._M_first = __old_start;
> +           __guard_elts._M_last = __old_finish;
> +         }
> +       __guard._M_storage = __old_start;
> +       __guard._M_len = this->_M_impl._M_end_of_storage - __old_start;
> +      }
> +      // deallocate should be called before assignments to _M_impl,
> +      // to avoid call-clobbering
> +
> +      this->_M_impl._M_start = __new_start;
> +      this->_M_impl._M_finish = __new_finish;
> +      this->_M_impl._M_end_of_storage = __new_start + __len;
> +    }
> +
>    template<typename _Tp, typename _Alloc>
>      _GLIBCXX20_CONSTEXPR
>      void
>
Matthias Kretz Nov. 23, 2023, 8:15 a.m. UTC | #6
On Sunday, 19 November 2023 22:53:37 CET Jan Hubicka wrote:
> Sadly it is really hard to work out this
> from IPA passes, since we basically care whether the iterator points to
> the same place as the end pointer, which are both passed by reference.
> This is inter-procedural value numbering that is quite out of reach.

I've done a fair share of branching on __builtin_constant_p in 
std::experimental::simd to improve code-gen. It's powerful! But maybe we 
also need the other side of the story to tell the optimizer: "I know you 
can't const-prop everything; but this variable / expression, even if you 
need to put in a lot of effort, the performance difference will be worth 
it."

For std::vector, the remaining capacity could be such a value. The 
functions f() and g() are equivalent (their code-gen isn't https://
compiler-explorer.com/z/r44ejK1qz):

#include <vector>

auto
f()
{
  std::vector<int> x;
  x.reserve(10);
  for (int i = 0; i < 10; ++i)
    x.push_back(0);
  return x;
}

auto
g()
{ return std::vector<int>(10, 0); }
Jan Hubicka Nov. 23, 2023, 3:07 p.m. UTC | #7
> On Sunday, 19 November 2023 22:53:37 CET Jan Hubicka wrote:
> > Sadly it is really hard to work out this
> > from IPA passes, since we basically care whether the iterator points to
> > the same place as the end pointer, which are both passed by reference.
> > This is inter-procedural value numbering that is quite out of reach.
> 
> I've done a fair share of branching on __builtin_constant_p in 
> std::experimental::simd to improve code-gen. It's powerful! But maybe we 
> also need the other side of the story to tell the optimizer: "I know you 
> can't const-prop everything; but this variable / expression, even if you 
> need to put in a lot of effort, the performance difference will be worth 
> it."
> 
> For std::vector, the remaining capacity could be such a value. The 
> functions f() and g() are equivalent (their code-gen isn't https://
> compiler-explorer.com/z/r44ejK1qz):
> 
> #include <vector>
> 
> auto
> f()
> {
>   std::vector<int> x;
>   x.reserve(10);
>   for (int i = 0; i < 10; ++i)
>     x.push_back(0);
>   return x;
> }
> auto
> g()
> { return std::vector<int>(10, 0); }

With my changes at -O3 we now inline push_back, so we could optimize the
first loop to the second. However with 
~/trunk-install/bin/gcc -O3  auto.C  -S -fdump-tree-all-details -fno-exceptions -fno-store-merging -fno-tree-slp-vectorize
the fist problem is right at the begining:

  <bb 2> [local count: 97603128]:
  MEM[(struct _Vector_impl_data *)x_4(D)]._M_start = 0B;
  MEM[(struct _Vector_impl_data *)x_4(D)]._M_finish = 0B;
  MEM[(struct _Vector_impl_data *)x_4(D)]._M_end_of_storage = 0B;
  _37 = operator new (40);
  _22 = x_4(D)->D.26019._M_impl.D.25320._M_finish;
  _23 = x_4(D)->D.26019._M_impl.D.25320._M_start;
  _24 = _22 - _23;
  if (_24 > 0)
    goto <bb 3>; [41.48%]
  else
    goto <bb 4>; [58.52%]

So the vector is fist initialized with _M_start=_M_finish=0, but after
call to new we already are not able to propagate this.

This is because x is returned and PTA considers it escaping.  This is
problem discussed in 
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112653
Which shows that it is likely worthwhile to fix PTA to handle this
correctly.
Jan Hubicka Nov. 23, 2023, 3:33 p.m. UTC | #8
> > On Sunday, 19 November 2023 22:53:37 CET Jan Hubicka wrote:
> > > Sadly it is really hard to work out this
> > > from IPA passes, since we basically care whether the iterator points to
> > > the same place as the end pointer, which are both passed by reference.
> > > This is inter-procedural value numbering that is quite out of reach.
> > 
> > I've done a fair share of branching on __builtin_constant_p in 
> > std::experimental::simd to improve code-gen. It's powerful! But maybe we 
> > also need the other side of the story to tell the optimizer: "I know you 
> > can't const-prop everything; but this variable / expression, even if you 
> > need to put in a lot of effort, the performance difference will be worth 
> > it."
> > 
> > For std::vector, the remaining capacity could be such a value. The 
> > functions f() and g() are equivalent (their code-gen isn't https://
> > compiler-explorer.com/z/r44ejK1qz):
> > 
> > #include <vector>
> > 
> > auto
> > f()
> > {
> >   std::vector<int> x;
> >   x.reserve(10);
> >   for (int i = 0; i < 10; ++i)
> >     x.push_back(0);
> >   return x;
> > }
> > auto
> > g()
> > { return std::vector<int>(10, 0); }
> 
> With my changes at -O3 we now inline push_back, so we could optimize the
> first loop to the second. However with 
> ~/trunk-install/bin/gcc -O3  auto.C  -S -fdump-tree-all-details -fno-exceptions -fno-store-merging -fno-tree-slp-vectorize
> the fist problem is right at the begining:
> 
>   <bb 2> [local count: 97603128]:
>   MEM[(struct _Vector_impl_data *)x_4(D)]._M_start = 0B;
>   MEM[(struct _Vector_impl_data *)x_4(D)]._M_finish = 0B;
>   MEM[(struct _Vector_impl_data *)x_4(D)]._M_end_of_storage = 0B;
>   _37 = operator new (40);

I also wonder, if default operator new and malloc can be handled as not
reading/modifying anything visible to the user code.  That would help
us to propagate here even if we lose track of points-to information.

We have:

  /* If the call is to a replaceable operator delete and results
     from a delete expression as opposed to a direct call to
     such operator, then we can treat it as free.  */
  if (fndecl
      && DECL_IS_OPERATOR_DELETE_P (fndecl)
      && DECL_IS_REPLACEABLE_OPERATOR (fndecl)
      && gimple_call_from_new_or_delete (stmt))
    return ". o ";
  /* Similarly operator new can be treated as malloc.  */
  if (fndecl
      && DECL_IS_REPLACEABLE_OPERATOR_NEW_P (fndecl)
      && gimple_call_from_new_or_delete (stmt))
    return "m ";
Which informs alias analysis that new returns pointer to memory
not aliasing with anything and that free is not reading anything
from its parameter (but it is modelled as a write to make it clear
that the memory dies).

stmt_kills_ref_p special cases BUILT_IN_FREE but not OPERATOR delete
to make it clear that everything pointed to by it dies.   This is needed
because 'o' only means that some data may be overwritten, but it does
not make it clear that all data dies.

Not handling operator delete seems like an omision, but maybe it is not
too critical since we emit clobbers around destructors that are usually
right before call to delete.  Also ipa-modref kill analysis does not
understand BUILT_IN_FREE nor delete and could.

I wonder if we can handle both as const except for side-effects
described.

Honza
>   _22 = x_4(D)->D.26019._M_impl.D.25320._M_finish;
>   _23 = x_4(D)->D.26019._M_impl.D.25320._M_start;
>   _24 = _22 - _23;
>   if (_24 > 0)
>     goto <bb 3>; [41.48%]
>   else
>     goto <bb 4>; [58.52%]
> 
> So the vector is fist initialized with _M_start=_M_finish=0, but after
> call to new we already are not able to propagate this.
> 
> This is because x is returned and PTA considers it escaping.  This is
> problem discussed in 
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112653
> Which shows that it is likely worthwhile to fix PTA to handle this
> correctly.
Jan Hubicka Nov. 23, 2023, 3:43 p.m. UTC | #9
Hi,
so if I understand it right, it should be safe to simply replace memmove
by memcpy.  I wonder if we can get rid of the count != 0 check at least
for glibc systems.  In general push_back now need inline-insns-auto to
be 33 to be inlined at -O2


jh@ryzen4:/tmp> cat ~/tt.C
#include <vector>
typedef unsigned int uint32_t;
struct pair_t {uint32_t first, second;};
struct pair_t pair;
void
test()
{
        std::vector<pair_t> stack;
        stack.push_back (pair);
        while (!stack.empty()) {
                pair_t cur = stack.back();
                stack.pop_back();
                if (!cur.first)
                {
                        cur.second++;
                        stack.push_back (cur);
                }
                if (cur.second > 10000)
                        break;
        }
}
int
main()
{
        for (int i = 0; i < 10000; i++)
          test();
}

jh@ryzen4:/tmp> ~/trunk-install/bin/g++ ~/tt.C -O2 --param max-inline-insns-auto=32 ; time ./a.out

real    0m0.399s
user    0m0.399s
sys     0m0.000s
jh@ryzen4:/tmp> ~/trunk-install/bin/g++ ~/tt.C -O2 --param max-inline-insns-auto=33 ; time ./a.out

real    0m0.039s
user    0m0.039s
sys     0m0.000s

Current inline limit is 15. We can save
 - 2 insns if inliner knows that conditional guarding
   builtin_unreachable will die (I have patch for this)
 - 4 isnsn if we work out that on 64bit hosts allocating vector with
   2^63 elements is impossible
 - 2 insns if we allow NULL parameter on memcpy
 - 2 insns if we allos NULL parameter on delete
So thi is 23 instructions. Inliner has hinting which could make
push_back reasonable candidate for -O2 inlining and then we could be
able to propagate interesitng stuff across repeated calls to push_back.

libstdc++-v3/ChangeLog:

	* include/bits/stl_uninitialized.h (relocate_a_1): Use memcpy instead of memmove.

diff --git a/libstdc++-v3/include/bits/stl_uninitialized.h b/libstdc++-v3/include/bits/stl_uninitialized.h
index 1282af3bc43..a9b802774c6 100644
--- a/libstdc++-v3/include/bits/stl_uninitialized.h
+++ b/libstdc++-v3/include/bits/stl_uninitialized.h
@@ -1119,14 +1119,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 #ifdef __cpp_lib_is_constant_evaluated
 	  if (std::is_constant_evaluated())
 	    {
-	      // Can't use memmove. Wrap the pointer so that __relocate_a_1
+	      // Can't use memcpy. Wrap the pointer so that __relocate_a_1
 	      // resolves to the non-trivial overload above.
 	      __gnu_cxx::__normal_iterator<_Tp*, void> __out(__result);
 	      __out = std::__relocate_a_1(__first, __last, __out, __alloc);
 	      return __out.base();
 	    }
 #endif
-	  __builtin_memmove(__result, __first, __count * sizeof(_Tp));
+	  __builtin_memcpy(__result, __first, __count * sizeof(_Tp));
 	}
       return __result + __count;
     }
Jonathan Wakely Nov. 23, 2023, 4:20 p.m. UTC | #10
On Thu, 23 Nov 2023 at 15:34, Jan Hubicka <hubicka@ucw.cz> wrote:
>
> > > On Sunday, 19 November 2023 22:53:37 CET Jan Hubicka wrote:
> > > > Sadly it is really hard to work out this
> > > > from IPA passes, since we basically care whether the iterator points to
> > > > the same place as the end pointer, which are both passed by reference.
> > > > This is inter-procedural value numbering that is quite out of reach.
> > >
> > > I've done a fair share of branching on __builtin_constant_p in
> > > std::experimental::simd to improve code-gen. It's powerful! But maybe we
> > > also need the other side of the story to tell the optimizer: "I know you
> > > can't const-prop everything; but this variable / expression, even if you
> > > need to put in a lot of effort, the performance difference will be worth
> > > it."
> > >
> > > For std::vector, the remaining capacity could be such a value. The
> > > functions f() and g() are equivalent (their code-gen isn't https://
> > > compiler-explorer.com/z/r44ejK1qz):
> > >
> > > #include <vector>
> > >
> > > auto
> > > f()
> > > {
> > >   std::vector<int> x;
> > >   x.reserve(10);
> > >   for (int i = 0; i < 10; ++i)
> > >     x.push_back(0);
> > >   return x;
> > > }
> > > auto
> > > g()
> > > { return std::vector<int>(10, 0); }
> >
> > With my changes at -O3 we now inline push_back, so we could optimize the
> > first loop to the second. However with
> > ~/trunk-install/bin/gcc -O3  auto.C  -S -fdump-tree-all-details -fno-exceptions -fno-store-merging -fno-tree-slp-vectorize
> > the fist problem is right at the begining:
> >
> >   <bb 2> [local count: 97603128]:
> >   MEM[(struct _Vector_impl_data *)x_4(D)]._M_start = 0B;
> >   MEM[(struct _Vector_impl_data *)x_4(D)]._M_finish = 0B;
> >   MEM[(struct _Vector_impl_data *)x_4(D)]._M_end_of_storage = 0B;
> >   _37 = operator new (40);
>
> I also wonder, if default operator new and malloc can be handled as not
> reading/modifying anything visible to the user code.

No, there's no way to know if the default operator new is being used.
A replacement operator new could be provided at link-time.

That's why we need -fsane-operator-new

> That would help
> us to propagate here even if we lose track of points-to information.
>
> We have:
>
>   /* If the call is to a replaceable operator delete and results
>      from a delete expression as opposed to a direct call to
>      such operator, then we can treat it as free.  */
>   if (fndecl
>       && DECL_IS_OPERATOR_DELETE_P (fndecl)
>       && DECL_IS_REPLACEABLE_OPERATOR (fndecl)
>       && gimple_call_from_new_or_delete (stmt))
>     return ". o ";
>   /* Similarly operator new can be treated as malloc.  */
>   if (fndecl
>       && DECL_IS_REPLACEABLE_OPERATOR_NEW_P (fndecl)
>       && gimple_call_from_new_or_delete (stmt))
>     return "m ";
> Which informs alias analysis that new returns pointer to memory
> not aliasing with anything and that free is not reading anything
> from its parameter (but it is modelled as a write to make it clear
> that the memory dies).

But this only applies to new T[n] not to operator new(n * sizeof(T)).
So it's not relevant to std::vector.

> stmt_kills_ref_p special cases BUILT_IN_FREE but not OPERATOR delete
> to make it clear that everything pointed to by it dies.   This is needed
> because 'o' only means that some data may be overwritten, but it does
> not make it clear that all data dies.
>
> Not handling operator delete seems like an omision, but maybe it is not
> too critical since we emit clobbers around destructors that are usually
> right before call to delete.  Also ipa-modref kill analysis does not
> understand BUILT_IN_FREE nor delete and could.
>
> I wonder if we can handle both as const except for side-effects
> described.
>
> Honza
> >   _22 = x_4(D)->D.26019._M_impl.D.25320._M_finish;
> >   _23 = x_4(D)->D.26019._M_impl.D.25320._M_start;
> >   _24 = _22 - _23;
> >   if (_24 > 0)
> >     goto <bb 3>; [41.48%]
> >   else
> >     goto <bb 4>; [58.52%]
> >
> > So the vector is fist initialized with _M_start=_M_finish=0, but after
> > call to new we already are not able to propagate this.
> >
> > This is because x is returned and PTA considers it escaping.  This is
> > problem discussed in
> > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112653
> > Which shows that it is likely worthwhile to fix PTA to handle this
> > correctly.
>
Jonathan Wakely Nov. 23, 2023, 4:26 p.m. UTC | #11
On Thu, 23 Nov 2023 at 15:44, Jan Hubicka <hubicka@ucw.cz> wrote:
>
> Hi,
> so if I understand it right, it should be safe to simply replace memmove
> by memcpy.  I wonder if we can get rid of the count != 0 check at least
> for glibc systems.

I don't think we can do that. It's still undefined with glibc, and
glibc marks it with __attribute__((nonnull)), and ubsan will diagnose
it.

>  In general push_back now need inline-insns-auto to
> be 33 to be inlined at -O2
>
>
> jh@ryzen4:/tmp> cat ~/tt.C
> #include <vector>
> typedef unsigned int uint32_t;
> struct pair_t {uint32_t first, second;};
> struct pair_t pair;
> void
> test()
> {
>         std::vector<pair_t> stack;
>         stack.push_back (pair);
>         while (!stack.empty()) {
>                 pair_t cur = stack.back();
>                 stack.pop_back();
>                 if (!cur.first)
>                 {
>                         cur.second++;
>                         stack.push_back (cur);
>                 }
>                 if (cur.second > 10000)
>                         break;
>         }
> }
> int
> main()
> {
>         for (int i = 0; i < 10000; i++)
>           test();
> }
>
> jh@ryzen4:/tmp> ~/trunk-install/bin/g++ ~/tt.C -O2 --param max-inline-insns-auto=32 ; time ./a.out
>
> real    0m0.399s
> user    0m0.399s
> sys     0m0.000s
> jh@ryzen4:/tmp> ~/trunk-install/bin/g++ ~/tt.C -O2 --param max-inline-insns-auto=33 ; time ./a.out
>
> real    0m0.039s
> user    0m0.039s
> sys     0m0.000s
>
> Current inline limit is 15. We can save
>  - 2 insns if inliner knows that conditional guarding
>    builtin_unreachable will die (I have patch for this)
>  - 4 isnsn if we work out that on 64bit hosts allocating vector with
>    2^63 elements is impossible
>  - 2 insns if we allow NULL parameter on memcpy

I don't think we can do that.

>  - 2 insns if we allos NULL parameter on delete

That's allowed, I think we just check first to avoid making a function
call if it's null, because we know operator delete will do nothing.

But if it's hurting inlining, maybe that's the wrong choice.

> So thi is 23 instructions. Inliner has hinting which could make
> push_back reasonable candidate for -O2 inlining and then we could be
> able to propagate interesitng stuff across repeated calls to push_back.
>
> libstdc++-v3/ChangeLog:
>
>         * include/bits/stl_uninitialized.h (relocate_a_1): Use memcpy instead of memmove.

This patch is OK for trunk.

>
> diff --git a/libstdc++-v3/include/bits/stl_uninitialized.h b/libstdc++-v3/include/bits/stl_uninitialized.h
> index 1282af3bc43..a9b802774c6 100644
> --- a/libstdc++-v3/include/bits/stl_uninitialized.h
> +++ b/libstdc++-v3/include/bits/stl_uninitialized.h
> @@ -1119,14 +1119,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>  #ifdef __cpp_lib_is_constant_evaluated
>           if (std::is_constant_evaluated())
>             {
> -             // Can't use memmove. Wrap the pointer so that __relocate_a_1
> +             // Can't use memcpy. Wrap the pointer so that __relocate_a_1
>               // resolves to the non-trivial overload above.
>               __gnu_cxx::__normal_iterator<_Tp*, void> __out(__result);
>               __out = std::__relocate_a_1(__first, __last, __out, __alloc);
>               return __out.base();
>             }
>  #endif
> -         __builtin_memmove(__result, __first, __count * sizeof(_Tp));
> +         __builtin_memcpy(__result, __first, __count * sizeof(_Tp));
>         }
>        return __result + __count;
>      }
>
Martin Jambor Nov. 24, 2023, 10:21 a.m. UTC | #12
Hello,

On Thu, Nov 23 2023, Jonathan Wakely wrote:
> On Thu, 23 Nov 2023 at 15:34, Jan Hubicka <hubicka@ucw.cz> wrote:
>>

[...]

>>
>> I also wonder, if default operator new and malloc can be handled as not
>> reading/modifying anything visible to the user code.
>
> No, there's no way to know if the default operator new is being used.
> A replacement operator new could be provided at link-time.
>
> That's why we need -fsane-operator-new
>

Would it make sense to add -fsane-operator-new to -Ofast?

Martin
Richard Biener Nov. 24, 2023, 10:23 a.m. UTC | #13
On Fri, 24 Nov 2023, Martin Jambor wrote:

> Hello,
> 
> On Thu, Nov 23 2023, Jonathan Wakely wrote:
> > On Thu, 23 Nov 2023 at 15:34, Jan Hubicka <hubicka@ucw.cz> wrote:
> >>
> 
> [...]
> 
> >>
> >> I also wonder, if default operator new and malloc can be handled as not
> >> reading/modifying anything visible to the user code.
> >
> > No, there's no way to know if the default operator new is being used.
> > A replacement operator new could be provided at link-time.
> >
> > That's why we need -fsane-operator-new
> >
> 
> Would it make sense to add -fsane-operator-new to -Ofast?

Yes, but somebody needs to implement it first ;)  And
define the constraints it imposes (also on operator delete?).

Richard.
Marc Glisse Nov. 24, 2023, 7:45 p.m. UTC | #14
On Thu, 23 Nov 2023, Jonathan Wakely wrote:

> That's why we need -fsane-operator-new

Although the standard forbids it, some of us just provide an inline 
implementation

inline void* operator new(std::size_t n){return malloc(n);}
inline void operator delete(void*p)noexcept{free(p);}
inline void operator delete(void*p,std::size_t)noexcept{free(p);}

(I could certainly add a check to abort if malloc returns 0 or other 
details, depending on what the application calls for)

It used to enable a number of optimizations, for instance in gcc-9

auto f(){ return std::vector<int>(4096); }

was optimized to just one call to calloc (someone broke that in gcc-10).

Using LTO on libsupc++ is related.

I don't know if we want to define "sane" operators new/delete, or just 
have a flag that promises that we won't try to replace the default ones.
Jan Hubicka Nov. 24, 2023, 8:07 p.m. UTC | #15
> With my changes at -O3 we now inline push_back, so we could optimize the
> first loop to the second. However with 
> ~/trunk-install/bin/gcc -O3  auto.C  -S -fdump-tree-all-details -fno-exceptions -fno-store-merging -fno-tree-slp-vectorize
> the fist problem is right at the begining:
> 
>   <bb 2> [local count: 97603128]:
>   MEM[(struct _Vector_impl_data *)x_4(D)]._M_start = 0B;
>   MEM[(struct _Vector_impl_data *)x_4(D)]._M_finish = 0B;
>   MEM[(struct _Vector_impl_data *)x_4(D)]._M_end_of_storage = 0B;
>   _37 = operator new (40);
>   _22 = x_4(D)->D.26019._M_impl.D.25320._M_finish;
>   _23 = x_4(D)->D.26019._M_impl.D.25320._M_start;

Thinking of this problem, it is easy to adjust reserve to copy _M_start
and _M_finish to local variables across the call of new() which makes
the old values visible to compiler regardless of points-to-analysis.
In fact _M_realloc_insert already has such code:

	  // Make local copies of these members because the compiler thinks
	  // the allocator can alter them if 'this' is globally reachable.
	  pointer __old_start = this->_M_impl._M_start;
	  pointer __old_finish = this->_M_impl._M_finish;

So attached patch does that in reserve.  The downside is that if things
are not inlined we may end up pushing extra copy to stack, but I believe
the benefit from inlining actually pays this back.

The testcase with loop still does not optimize it so I simplified it:

#include <vector>

auto
f()
{
  std::vector<int> x;
  x.reserve(10);
  x.push_back(0);
  x.push_back(0);
  x.push_back(0);
  x.push_back(0);
  x.push_back(0);
  x.push_back(0);
  x.push_back(0);
  x.push_back(0);
  x.push_back(0);
  return x;
}

auto                                                                                                                                                                                                                                                                   
g()                                                                                                                                                                                                                                                                    
{ return std::vector<int>(10, 0); }                                                                                                                                                                                                                                    

This now compiles to less code but it is somewhat funny:

  <bb 2> [local count: 1073741824]:
  MEM <vector(2) long unsigned int> [(int * *)x_3(D)] = { 0, 0 };
  MEM[(struct _Vector_impl_data *)x_3(D)]._M_end_of_storage = 0B;
  _70 = operator new (40);

  <bb 3> [local count: 1073741824]:
  x_3(D)->D.26024._M_impl.D.25325._M_start = _70;
  _65 = _70 + 40;
  x_3(D)->D.26024._M_impl.D.25325._M_end_of_storage = _65;
  _74 = _70 + 4;
  x_3(D)->D.26024._M_impl.D.25325._M_finish = _74;
  MEM <unsigned long> [(int *)_70] = 0;
  _80 = _70 + 8;
  x_3(D)->D.26024._M_impl.D.25325._M_finish = _80;
  *_80 = 0;
  _86 = _70 + 12;
  x_3(D)->D.26024._M_impl.D.25325._M_finish = _86;
  *_86 = 0;
  _92 = _70 + 16;
  x_3(D)->D.26024._M_impl.D.25325._M_finish = _92;
  *_92 = 0;
  _98 = _70 + 20;
  x_3(D)->D.26024._M_impl.D.25325._M_finish = _98;
  *_98 = 0;
  _104 = _70 + 24;
  x_3(D)->D.26024._M_impl.D.25325._M_finish = _104;
  *_104 = 0;
  _110 = _70 + 28;
  x_3(D)->D.26024._M_impl.D.25325._M_finish = _110;
  *_110 = 0;
  _116 = _70 + 32;
  x_3(D)->D.26024._M_impl.D.25325._M_finish = _116;
  *_116 = 0;
  _122 = _70 + 36;
  x_3(D)->D.26024._M_impl.D.25325._M_finish = _122;
  return x_3(D);

The setup code in BB2 is useless and due to fake escape, as discussed in
PRR112653.

However it is funny that we miss dead store elimintation and repeately
set x_3(D)->D.26024._M_impl.D.25325._M_finish = _104;

The problem here is that until pushback.C.208t.forwprop4:std::vector we
do not optimize the reallocation code:

  <bb 2> [local count: 1073741824]:
  MEM <vector(2) long unsigned int> [(int * *)x_3(D)] = { 0, 0 };
  MEM[(struct _Vector_impl_data *)x_3(D)]._M_end_of_storage = 0B;
  _70 = operator new (40);

  <bb 3> [local count: 1073741824]:
  x_3(D)->D.26024._M_impl.D.25325._M_start = _70;
  _65 = _70 + 40;
  x_3(D)->D.26024._M_impl.D.25325._M_end_of_storage = _65;
  *_70 = 0;
  _74 = _70 + 4;
  x_3(D)->D.26024._M_impl.D.25325._M_finish = _74;
  D.26115 = 0;
  if (_65 != _74)
    goto <bb 4>; [82.57%]
  else
    goto <bb 5>; [17.43%]

  <bb 4> [local count: 886588624]:
  *_74 = 0;
  _80 = _70 + 8;
  x_3(D)->D.26024._M_impl.D.25325._M_finish = _80;
  goto <bb 8>; [100.00%]

  <bb 5> [local count: 187153200]:
  std::vector<int>::_M_realloc_append<int> (x_3(D), &D.26115);
  goto <bb 7>; [100.00%]

  <bb 6> [count: 0]:
<L12>:
  goto <bb 44>; [100.00%]

  <bb 7> [local count: 187153200]:
  pretmp_127 = MEM[(int * const &)x_3(D) + 8];
  pretmp_144 = MEM[(struct _Vector_base *)x_3(D)]._M_impl.D.25325._M_end_of_storage;

And after forwprop4 it is too late because DSE is no longer run.
I filled PR112706. For some reason FRE is missing to optimize:

int *ptr;
void link_error ();
void
test ()
{

        int *ptr1 = ptr + 10;
        int *ptr2 = ptr + 20;
        if (ptr1 == ptr2)
                link_error ();
}


The vector.tcc change was regtested on x86_64-linux, OK?

libstdc++-v3/ChangeLog:

	* include/bits/vector.tcc (reserve): Copy _M_start and _M_finish
	to local variables to allow propagation across call to
	allocator.

diff --git a/libstdc++-v3/include/bits/vector.tcc b/libstdc++-v3/include/bits/vector.tcc
index 0ccef7911b3..0a9db29c1c7 100644
--- a/libstdc++-v3/include/bits/vector.tcc
+++ b/libstdc++-v3/include/bits/vector.tcc
@@ -72,27 +72,30 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
       if (this->capacity() < __n)
 	{
 	  const size_type __old_size = size();
+	  // Make local copies of these members because the compiler thinks
+	  // the allocator can alter them if 'this' is globally reachable.
+	  pointer __old_start = this->_M_impl._M_start;
+	  pointer __old_finish = this->_M_impl._M_finish;
 	  pointer __tmp;
 #if __cplusplus >= 201103L
 	  if _GLIBCXX17_CONSTEXPR (_S_use_relocate())
 	    {
 	      __tmp = this->_M_allocate(__n);
-	      _S_relocate(this->_M_impl._M_start, this->_M_impl._M_finish,
+	      _S_relocate(__old_start, __old_finish,
 			  __tmp, _M_get_Tp_allocator());
 	    }
 	  else
 #endif
 	    {
 	      __tmp = _M_allocate_and_copy(__n,
-		_GLIBCXX_MAKE_MOVE_IF_NOEXCEPT_ITERATOR(this->_M_impl._M_start),
-		_GLIBCXX_MAKE_MOVE_IF_NOEXCEPT_ITERATOR(this->_M_impl._M_finish));
-	      std::_Destroy(this->_M_impl._M_start, this->_M_impl._M_finish,
+		_GLIBCXX_MAKE_MOVE_IF_NOEXCEPT_ITERATOR(__old_start),
+		_GLIBCXX_MAKE_MOVE_IF_NOEXCEPT_ITERATOR(__old_finish));
+	      std::_Destroy(__old_start, __old_finish,
 			    _M_get_Tp_allocator());
 	    }
 	  _GLIBCXX_ASAN_ANNOTATE_REINIT;
-	  _M_deallocate(this->_M_impl._M_start,
-			this->_M_impl._M_end_of_storage
-			- this->_M_impl._M_start);
+	  _M_deallocate(__old_start,
+			this->_M_impl._M_end_of_storage - __old_finish);
 	  this->_M_impl._M_start = __tmp;
 	  this->_M_impl._M_finish = __tmp + __old_size;
 	  this->_M_impl._M_end_of_storage = this->_M_impl._M_start + __n;
Jonathan Wakely Nov. 24, 2023, 9:55 p.m. UTC | #16
On Fri, 24 Nov 2023 at 20:07, Jan Hubicka <hubicka@ucw.cz> wrote:
> The vector.tcc change was regtested on x86_64-linux, OK?
>
> libstdc++-v3/ChangeLog:
>
>         * include/bits/vector.tcc (reserve): Copy _M_start and _M_finish
>         to local variables to allow propagation across call to
>         allocator.
>
> diff --git a/libstdc++-v3/include/bits/vector.tcc b/libstdc++-v3/include/bits/vector.tcc
> index 0ccef7911b3..0a9db29c1c7 100644
> --- a/libstdc++-v3/include/bits/vector.tcc
> +++ b/libstdc++-v3/include/bits/vector.tcc
> @@ -72,27 +72,30 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
>        if (this->capacity() < __n)
>         {
>           const size_type __old_size = size();
> +         // Make local copies of these members because the compiler thinks
> +         // the allocator can alter them if 'this' is globally reachable.
> +         pointer __old_start = this->_M_impl._M_start;
> +         pointer __old_finish = this->_M_impl._M_finish;
>           pointer __tmp;
>  #if __cplusplus >= 201103L
>           if _GLIBCXX17_CONSTEXPR (_S_use_relocate())
>             {
>               __tmp = this->_M_allocate(__n);
> -             _S_relocate(this->_M_impl._M_start, this->_M_impl._M_finish,
> +             _S_relocate(__old_start, __old_finish,
>                           __tmp, _M_get_Tp_allocator());

Please move "__tmp," to the first line, as it will fit there now that
the line got shorter:

          _S_relocate(__old_start, __old_finish, __tmp,
              _M_get_Tp_allocator());


>             }
>           else
>  #endif
>             {
>               __tmp = _M_allocate_and_copy(__n,
> -               _GLIBCXX_MAKE_MOVE_IF_NOEXCEPT_ITERATOR(this->_M_impl._M_start),
> -               _GLIBCXX_MAKE_MOVE_IF_NOEXCEPT_ITERATOR(this->_M_impl._M_finish));
> -             std::_Destroy(this->_M_impl._M_start, this->_M_impl._M_finish,
> +               _GLIBCXX_MAKE_MOVE_IF_NOEXCEPT_ITERATOR(__old_start),
> +               _GLIBCXX_MAKE_MOVE_IF_NOEXCEPT_ITERATOR(__old_finish));
> +             std::_Destroy(__old_start, __old_finish,
>                             _M_get_Tp_allocator());

The _Destroy call will fit on one line now:

          std::_Destroy(__old_start, __old_finish, _M_get_Tp_allocator());



>             }
>           _GLIBCXX_ASAN_ANNOTATE_REINIT;
> -         _M_deallocate(this->_M_impl._M_start,
> -                       this->_M_impl._M_end_of_storage
> -                       - this->_M_impl._M_start);
> +         _M_deallocate(__old_start,
> +                       this->_M_impl._M_end_of_storage - __old_finish);

This should be __old_start.

I think what you have here will show Asan and/or valgrind errors, as
the wrong length will be passed to operator delete.


>           this->_M_impl._M_start = __tmp;
>           this->_M_impl._M_finish = __tmp + __old_size;
>           this->_M_impl._M_end_of_storage = this->_M_impl._M_start + __n;
>
diff mbox series

Patch

diff --git a/libstdc++-v3/include/bits/stl_vector.h b/libstdc++-v3/include/bits/stl_vector.h
index 5e18f6eedce..973f4d7e2e9 100644
--- a/libstdc++-v3/include/bits/stl_vector.h
+++ b/libstdc++-v3/include/bits/stl_vector.h
@@ -1288,7 +1288,7 @@  _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
 	    _GLIBCXX_ASAN_ANNOTATE_GREW(1);
 	  }
 	else
-	  _M_realloc_insert(end(), __x);
+	  _M_realloc_append(__x);
       }
 
 #if __cplusplus >= 201103L
@@ -1822,6 +1822,9 @@  _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
 
       void
       _M_realloc_insert(iterator __position, const value_type& __x);
+
+      void
+      _M_realloc_append(const value_type& __x);
 #else
       // A value_type object constructed with _Alloc_traits::construct()
       // and destroyed with _Alloc_traits::destroy().
@@ -1871,6 +1874,11 @@  _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
 	void
 	_M_realloc_insert(iterator __position, _Args&&... __args);
 
+      template<typename... _Args>
+	_GLIBCXX20_CONSTEXPR
+	void
+	_M_realloc_append(_Args&&... __args);
+
       // Either move-construct at the end, or forward to _M_insert_aux.
       _GLIBCXX20_CONSTEXPR
       iterator
diff --git a/libstdc++-v3/include/bits/vector.tcc b/libstdc++-v3/include/bits/vector.tcc
index 80631d1e2a1..1306676e795 100644
--- a/libstdc++-v3/include/bits/vector.tcc
+++ b/libstdc++-v3/include/bits/vector.tcc
@@ -120,7 +120,7 @@  _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
 	    _GLIBCXX_ASAN_ANNOTATE_GREW(1);
 	  }
 	else
-	  _M_realloc_insert(end(), std::forward<_Args>(__args)...);
+	  _M_realloc_append(std::forward<_Args>(__args)...);
 #if __cplusplus > 201402L
 	return back();
 #endif
@@ -459,6 +459,8 @@  _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
 #endif
     {
       const size_type __len = _M_check_len(1u, "vector::_M_realloc_insert");
+      if (__len <= 0)
+	__builtin_unreachable ();
       pointer __old_start = this->_M_impl._M_start;
       pointer __old_finish = this->_M_impl._M_finish;
       const size_type __elems_before = __position - begin();
@@ -571,6 +573,129 @@  _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
       this->_M_impl._M_end_of_storage = __new_start + __len;
     }
 
+#if __cplusplus >= 201103L
+  template<typename _Tp, typename _Alloc>
+    template<typename... _Args>
+      _GLIBCXX20_CONSTEXPR
+      void
+      vector<_Tp, _Alloc>::
+      _M_realloc_append(_Args&&... __args)
+#else
+  template<typename _Tp, typename _Alloc>
+    void
+    vector<_Tp, _Alloc>::
+    _M_realloc_append(const _Tp& __x)
+#endif
+    {
+      const size_type __len = _M_check_len(1u, "vector::_M_realloc_append");
+      if (__len <= 0)
+	__builtin_unreachable ();
+      pointer __old_start = this->_M_impl._M_start;
+      pointer __old_finish = this->_M_impl._M_finish;
+      const size_type __elems = end() - begin();
+      pointer __new_start(this->_M_allocate(__len));
+      pointer __new_finish(__new_start);
+
+      // RAII guard for allocated storage.
+      struct _Guard
+      {
+	pointer _M_storage;	    // Storage to deallocate
+	size_type _M_len;
+	_Tp_alloc_type& _M_alloc;
+
+	_GLIBCXX20_CONSTEXPR
+	_Guard(pointer __s, size_type __l, _Tp_alloc_type& __a)
+	: _M_storage(__s), _M_len(__l), _M_alloc(__a)
+	{ }
+
+	_GLIBCXX20_CONSTEXPR
+	~_Guard()
+	{
+	  if (_M_storage)
+	    __gnu_cxx::__alloc_traits<_Tp_alloc_type>::
+	      deallocate(_M_alloc, _M_storage, _M_len);
+	}
+
+      private:
+	_Guard(const _Guard&);
+      };
+
+      {
+	_Guard __guard(__new_start, __len, _M_impl);
+
+	// The order of the three operations is dictated by the C++11
+	// case, where the moves could alter a new element belonging
+	// to the existing vector.  This is an issue only for callers
+	// taking the element by lvalue ref (see last bullet of C++11
+	// [res.on.arguments]).
+
+	// If this throws, the existing elements are unchanged.
+#if __cplusplus >= 201103L
+	_Alloc_traits::construct(this->_M_impl,
+				 std::__to_address(__new_start + __elems),
+				 std::forward<_Args>(__args)...);
+#else
+	_Alloc_traits::construct(this->_M_impl,
+				 __new_start + __elems,
+				 __x);
+#endif
+
+#if __cplusplus >= 201103L
+	if _GLIBCXX17_CONSTEXPR (_S_use_relocate())
+	  {
+	    // Relocation cannot throw.
+	    __new_finish = _S_relocate(__old_start, __old_finish,
+				       __new_start, _M_get_Tp_allocator());
+	    ++__new_finish;
+	  }
+	else
+#endif
+	  {
+	    // RAII type to destroy initialized elements.
+	    struct _Guard_elts
+	    {
+	      pointer _M_first, _M_last;  // Elements to destroy
+	      _Tp_alloc_type& _M_alloc;
+
+	      _GLIBCXX20_CONSTEXPR
+	      _Guard_elts(pointer __elt, _Tp_alloc_type& __a)
+	      : _M_first(__elt), _M_last(__elt + 1), _M_alloc(__a)
+	      { }
+
+	      _GLIBCXX20_CONSTEXPR
+	      ~_Guard_elts()
+	      { std::_Destroy(_M_first, _M_last, _M_alloc); }
+
+	    private:
+	      _Guard_elts(const _Guard_elts&);
+	    };
+
+	    // Guard the new element so it will be destroyed if anything throws.
+	    _Guard_elts __guard_elts(__new_start + __elems, _M_impl);
+
+	    __new_finish = std::__uninitialized_move_if_noexcept_a(
+			     __old_start, __old_finish,
+			     __new_start, _M_get_Tp_allocator());
+
+	    ++__new_finish;
+	    // Guard everything before the new element too.
+	    __guard_elts._M_first = __new_start;
+
+	    // New storage has been fully initialized, destroy the old elements.
+	    __guard_elts._M_first = __old_start;
+	    __guard_elts._M_last = __old_finish;
+	  }
+	__guard._M_storage = __old_start;
+	__guard._M_len = this->_M_impl._M_end_of_storage - __old_start;
+      }
+      // deallocate should be called before assignments to _M_impl,
+      // to avoid call-clobbering
+
+      this->_M_impl._M_start = __new_start;
+      this->_M_impl._M_finish = __new_finish;
+      this->_M_impl._M_end_of_storage = __new_start + __len;
+    }
+
   template<typename _Tp, typename _Alloc>
     _GLIBCXX20_CONSTEXPR
     void