Message ID | 5728B8CC.7030405@eelis.net |
---|---|
State | New |
Headers | show |
>>> On 2016-05-01 16:18, Eelis wrote: >>>> The attached patch optimizes std::shuffle for the very common case >>>> where the generator range is large enough that a single invocation >>>> can produce two swap positions. >>>> >>>> This reduces the runtime of the following testcase by 37% on my machine: Gentle ping. :) Did anyone get a chance to take a look at this? Does the idea seem sound? Does the implementation seem correct? Thanks, Eelis
On 03/05/16 16:42 +0200, Eelis van der Weegen wrote: >Ah, thanks, I forgot to re-attach when I sent to include the libstdc++ list. > >On 2016-05-03 14:38, Jonathan Wakely wrote: >>ENOPATCH >> >>On 1 May 2016 at 15:21, Eelis <eelis@eelis.net> wrote: >>>Sorry, forgot to include the libstdc++ list. >>> >>>On 2016-05-01 16:18, Eelis wrote: >>>> >>>>Hi, >>>> >>>>The attached patch optimizes std::shuffle for the very common case >>>>where the generator range is large enough that a single invocation >>>>can produce two swap positions. >>>> >>>>This reduces the runtime of the following testcase by 37% on my machine: >>>> >>>> int main() >>>> { >>>> std::mt19937 gen; >>>> >>>> std::vector<int> v; >>>> v.reserve(10000); >>>> for (int i = 0; i != 10000; ++i) >>>> { >>>> v.push_back(i); >>>> std::shuffle(v.begin(), v.end(), gen); >>>> } >>>> >>>> std::cout << v.front() << '\n'; >>>> } >>>> >>>>Thoughts? >>>> >>>>Thanks, >>>> >>>>Eelis >>>> >>> >>> > >Index: libstdc++-v3/include/bits/stl_algo.h >=================================================================== >--- libstdc++-v3/include/bits/stl_algo.h (revision 235680) >+++ libstdc++-v3/include/bits/stl_algo.h (working copy) >@@ -3708,6 +3708,22 @@ > #endif > > #ifdef _GLIBCXX_USE_C99_STDINT_TR1 >+ >+ template<typename _IntType, typename _UniformRandomNumberGenerator> We should avoid introducing new names based on "uniform random number generator" and use _UniformRandomBitGenerator as per https://wg21.link/p0346r1 >+ inline _IntType >+ __generate_random_index_below(_IntType __bound, _UniformRandomNumberGenerator& __g) >+ { >+ const _IntType __urngrange = __g.max() - __g.min() + 1; Similarly, let's use __urbgrange here. >+ const _IntType __scaling = __urngrange / __bound; I think I'd like either a comment on the function documenting the assumption about __bound and __g, or an explicit check: __glibcxx_assert( __scaling >= __bound ); >+ const _IntType __past = __bound * __scaling; >+ >+ for (;;) >+ { >+ const _IntType __r = _IntType(__g()) - __g.min(); >+ if (__r < __past) return __r / __scaling; This is basically the same algorithm as uniform_int_distribution so doesn't introduce any bias, right? Is this significantly faster than just using uniform_int_distribution<_IntType>{0, __bound - 1}(__g) so we don't need to duplicate the logic? (And people maintaining the code won't reconvince themselves it's correct every time they look at it :-) >+ } >+ } >+ > /** > * @brief Shuffle the elements of a sequence using a uniform random > * number generator. >@@ -3740,6 +3756,40 @@ > typedef typename std::make_unsigned<_DistanceType>::type __ud_type; > typedef typename std::uniform_int_distribution<__ud_type> __distr_type; > typedef typename __distr_type::param_type __p_type; >+ >+ typedef typename std::remove_reference<_UniformRandomNumberGenerator>::type _Gen; >+ typedef typename std::common_type<typename _Gen::result_type, __ud_type>::type __uc_type; >+ >+ const __uc_type __urngrange = _Gen::max() - _Gen::min() + 1; >+ const __uc_type __urange = __uc_type(__last - __first); >+ >+ if (__urngrange / __urange >= __urange) >+ // I.e. (__urngrange >= __urange * __urange) but without wrap issues. >+ { >+ for (_RandomAccessIterator __i = __first + 1; __i != __last; ) >+ { >+ const __uc_type __swap_range = __uc_type(__i - __first) + 1; >+ >+ if (__i + 1 == __last) Could we hoist this test out of the loop somehow? If we change the loop condition to be __i+1 < __last we don't need to test it on every iteration, and then after the loop we can just do the final swap if (__urange % 2). >+ { >+ const __uc_type __pos = __generate_random_index_below(__swap_range, __g); >+ std::iter_swap(__i, __first + __pos); >+ return; >+ } >+ >+ // Use a single generator invocation to produce swap positions for >+ // both of the next two elements: >+ >+ const __uc_type __comp_range = __swap_range * (__swap_range + 1); >+ const __uc_type __pospos = __generate_random_index_below(__comp_range, __g); >+ >+ std::iter_swap(__i++, __first + (__pospos % __swap_range)); >+ std::iter_swap(__i++, __first + (__pospos / __swap_range)); I think I've convinced myself this is correct :-) Values of __pospos will be uniformly distributed in [0, __comp_range) and so the / and % results will be too. >+ } >+ >+ return; >+ } >+ > __distr_type __d; > > for (_RandomAccessIterator __i = __first + 1; __i != __last; ++__i)
On 31/08/16 13:45 +0100, Jonathan Wakely wrote: >On 03/05/16 16:42 +0200, Eelis van der Weegen wrote: >>Ah, thanks, I forgot to re-attach when I sent to include the libstdc++ list. >> >>On 2016-05-03 14:38, Jonathan Wakely wrote: >>>ENOPATCH >>> >>>On 1 May 2016 at 15:21, Eelis <eelis@eelis.net> wrote: >>>>Sorry, forgot to include the libstdc++ list. >>>> >>>>On 2016-05-01 16:18, Eelis wrote: >>>>> >>>>>Hi, >>>>> >>>>>The attached patch optimizes std::shuffle for the very common case >>>>>where the generator range is large enough that a single invocation >>>>>can produce two swap positions. >>>>> >>>>>This reduces the runtime of the following testcase by 37% on my machine: >>>>> >>>>> int main() >>>>> { >>>>> std::mt19937 gen; >>>>> >>>>> std::vector<int> v; >>>>> v.reserve(10000); >>>>> for (int i = 0; i != 10000; ++i) >>>>> { >>>>> v.push_back(i); >>>>> std::shuffle(v.begin(), v.end(), gen); >>>>> } >>>>> >>>>> std::cout << v.front() << '\n'; >>>>> } >>>>> >>>>>Thoughts? >>>>> >>>>>Thanks, >>>>> >>>>>Eelis >>>>> >>>> >>>> >> > >>Index: libstdc++-v3/include/bits/stl_algo.h >>=================================================================== >>--- libstdc++-v3/include/bits/stl_algo.h (revision 235680) >>+++ libstdc++-v3/include/bits/stl_algo.h (working copy) >>@@ -3708,6 +3708,22 @@ >>#endif >> >>#ifdef _GLIBCXX_USE_C99_STDINT_TR1 >>+ >>+ template<typename _IntType, typename _UniformRandomNumberGenerator> > >We should avoid introducing new names based on "uniform random number >generator" and use _UniformRandomBitGenerator as per >https://wg21.link/p0346r1 > >>+ inline _IntType >>+ __generate_random_index_below(_IntType __bound, _UniformRandomNumberGenerator& __g) >>+ { >>+ const _IntType __urngrange = __g.max() - __g.min() + 1; > >Similarly, let's use __urbgrange here. > >>+ const _IntType __scaling = __urngrange / __bound; > >I think I'd like either a comment on the function documenting the >assumption about __bound and __g, or an explicit check: > > __glibcxx_assert( __scaling >= __bound ); > >>+ const _IntType __past = __bound * __scaling; >>+ >>+ for (;;) >>+ { >>+ const _IntType __r = _IntType(__g()) - __g.min(); >>+ if (__r < __past) return __r / __scaling; > >This is basically the same algorithm as uniform_int_distribution so >doesn't introduce any bias, right? > >Is this significantly faster than just using >uniform_int_distribution<_IntType>{0, __bound - 1}(__g) so we don't >need to duplicate the logic? (And people maintaining the code won't >reconvince themselves it's correct every time they look at it :-) > > >>+ } >>+ } >>+ >> /** >> * @brief Shuffle the elements of a sequence using a uniform random >> * number generator. >>@@ -3740,6 +3756,40 @@ >> typedef typename std::make_unsigned<_DistanceType>::type __ud_type; >> typedef typename std::uniform_int_distribution<__ud_type> __distr_type; >> typedef typename __distr_type::param_type __p_type; >>+ >>+ typedef typename std::remove_reference<_UniformRandomNumberGenerator>::type _Gen; >>+ typedef typename std::common_type<typename _Gen::result_type, __ud_type>::type __uc_type; >>+ >>+ const __uc_type __urngrange = _Gen::max() - _Gen::min() + 1; >>+ const __uc_type __urange = __uc_type(__last - __first); >>+ >>+ if (__urngrange / __urange >= __urange) >>+ // I.e. (__urngrange >= __urange * __urange) but without wrap issues. >>+ { >>+ for (_RandomAccessIterator __i = __first + 1; __i != __last; ) >>+ { >>+ const __uc_type __swap_range = __uc_type(__i - __first) + 1; >>+ >>+ if (__i + 1 == __last) > >Could we hoist this test out of the loop somehow? > >If we change the loop condition to be __i+1 < __last we don't need to >test it on every iteration, and then after the loop we can just do >the final swap if (__urange % 2). > >>+ { >>+ const __uc_type __pos = __generate_random_index_below(__swap_range, __g); >>+ std::iter_swap(__i, __first + __pos); >>+ return; >>+ } >>+ >>+ // Use a single generator invocation to produce swap positions for >>+ // both of the next two elements: >>+ >>+ const __uc_type __comp_range = __swap_range * (__swap_range + 1); >>+ const __uc_type __pospos = __generate_random_index_below(__comp_range, __g); >>+ >>+ std::iter_swap(__i++, __first + (__pospos % __swap_range)); >>+ std::iter_swap(__i++, __first + (__pospos / __swap_range)); > >I think I've convinced myself this is correct :-) > >Values of __pospos will be uniformly distributed in [0, __comp_range) iThis is true, but ... >and so the / and % results will be too. This isn't. If __swap_range is 3, then __comp_range is 10 and __pospos is uniformly distributed in [0, 9]. (__pospos % __swap_range) is not uniformly distributed, we get P(0) = 0.4, P(1) = 0.3, P(2) = 0.3. Similarly, (__pospos / __swap_range) is not uniform, we get P(0) = 0.3, P(1) = 0.3, P(2) = 0.3, P(3) = 0.1 This means that certain permuations of the input are more likely than others, which fails to meet the requirements of the function. Or is there a flaw in my reasoning?
On Thu, 1 Sep 2016, Jonathan Wakely wrote: >>> + const __uc_type __comp_range = __swap_range * (__swap_range + 1); > > If __swap_range is 3, then __comp_range is 10 and ???
On 2016-09-01 17:14, Jonathan Wakely wrote: > On 31/08/16 13:45 +0100, Jonathan Wakely wrote: >> On 03/05/16 16:42 +0200, Eelis van der Weegen wrote: >>> Ah, thanks, I forgot to re-attach when I sent to include the libstdc++ list. >>> >>> On 2016-05-03 14:38, Jonathan Wakely wrote: >>>> ENOPATCH >>>> >>>> On 1 May 2016 at 15:21, Eelis <eelis@eelis.net> wrote: >>>>> Sorry, forgot to include the libstdc++ list. >>>>> >>>>> On 2016-05-01 16:18, Eelis wrote: >>>>>> >>>>>> Hi, >>>>>> >>>>>> The attached patch optimizes std::shuffle for the very common case >>>>>> where the generator range is large enough that a single invocation >>>>>> can produce two swap positions. >>>>>> >>>>>> This reduces the runtime of the following testcase by 37% on my machine: >>>>>> >>>>>> int main() >>>>>> { >>>>>> std::mt19937 gen; >>>>>> >>>>>> std::vector<int> v; >>>>>> v.reserve(10000); >>>>>> for (int i = 0; i != 10000; ++i) >>>>>> { >>>>>> v.push_back(i); >>>>>> std::shuffle(v.begin(), v.end(), gen); >>>>>> } >>>>>> >>>>>> std::cout << v.front() << '\n'; >>>>>> } >>>>>> >>>>>> Thoughts? >>>>>> >>>>>> Thanks, >>>>>> >>>>>> Eelis >>>>>> >>>>> >>>>> >>> >> >>> Index: libstdc++-v3/include/bits/stl_algo.h >>> =================================================================== >>> --- libstdc++-v3/include/bits/stl_algo.h (revision 235680) >>> +++ libstdc++-v3/include/bits/stl_algo.h (working copy) >>> @@ -3708,6 +3708,22 @@ >>> #endif >>> >>> #ifdef _GLIBCXX_USE_C99_STDINT_TR1 >>> + >>> + template<typename _IntType, typename _UniformRandomNumberGenerator> >> >> We should avoid introducing new names based on "uniform random number >> generator" and use _UniformRandomBitGenerator as per >> https://wg21.link/p0346r1 >> >>> + inline _IntType >>> + __generate_random_index_below(_IntType __bound, _UniformRandomNumberGenerator& __g) >>> + { >>> + const _IntType __urngrange = __g.max() - __g.min() + 1; >> >> Similarly, let's use __urbgrange here. >> >>> + const _IntType __scaling = __urngrange / __bound; >> >> I think I'd like either a comment on the function documenting the >> assumption about __bound and __g, or an explicit check: >> >> __glibcxx_assert( __scaling >= __bound ); >> >>> + const _IntType __past = __bound * __scaling; >>> + >>> + for (;;) >>> + { >>> + const _IntType __r = _IntType(__g()) - __g.min(); >>> + if (__r < __past) return __r / __scaling; >> >> This is basically the same algorithm as uniform_int_distribution so >> doesn't introduce any bias, right? >> >> Is this significantly faster than just using >> uniform_int_distribution<_IntType>{0, __bound - 1}(__g) so we don't >> need to duplicate the logic? (And people maintaining the code won't >> reconvince themselves it's correct every time they look at it :-) >> >> >>> + } >>> + } >>> + >>> /** >>> * @brief Shuffle the elements of a sequence using a uniform random >>> * number generator. >>> @@ -3740,6 +3756,40 @@ >>> typedef typename std::make_unsigned<_DistanceType>::type __ud_type; >>> typedef typename std::uniform_int_distribution<__ud_type> __distr_type; >>> typedef typename __distr_type::param_type __p_type; >>> + >>> + typedef typename std::remove_reference<_UniformRandomNumberGenerator>::type _Gen; >>> + typedef typename std::common_type<typename _Gen::result_type, __ud_type>::type __uc_type; >>> + >>> + const __uc_type __urngrange = _Gen::max() - _Gen::min() + 1; >>> + const __uc_type __urange = __uc_type(__last - __first); >>> + >>> + if (__urngrange / __urange >= __urange) >>> + // I.e. (__urngrange >= __urange * __urange) but without wrap issues. >>> + { >>> + for (_RandomAccessIterator __i = __first + 1; __i != __last; ) >>> + { >>> + const __uc_type __swap_range = __uc_type(__i - __first) + 1; >>> + >>> + if (__i + 1 == __last) >> >> Could we hoist this test out of the loop somehow? >> >> If we change the loop condition to be __i+1 < __last we don't need to >> test it on every iteration, and then after the loop we can just do >> the final swap if (__urange % 2). >> >>> + { >>> + const __uc_type __pos = __generate_random_index_below(__swap_range, __g); >>> + std::iter_swap(__i, __first + __pos); >>> + return; >>> + } >>> + >>> + // Use a single generator invocation to produce swap positions for >>> + // both of the next two elements: >>> + >>> + const __uc_type __comp_range = __swap_range * (__swap_range + 1); >>> + const __uc_type __pospos = __generate_random_index_below(__comp_range, __g); >>> + >>> + std::iter_swap(__i++, __first + (__pospos % __swap_range)); >>> + std::iter_swap(__i++, __first + (__pospos / __swap_range)); >> >> I think I've convinced myself this is correct :-) >> >> Values of __pospos will be uniformly distributed in [0, __comp_range) > > iThis is true, but ... > >> and so the / and % results will be too. > > This isn't. > > If __swap_range is 3, then __comp_range is 10 and > __pospos is uniformly distributed in [0, 9]. > > (__pospos % __swap_range) is not uniformly distributed, we get > P(0) = 0.4, P(1) = 0.3, P(2) = 0.3. > > Similarly, (__pospos / __swap_range) is not uniform, we get > P(0) = 0.3, P(1) = 0.3, P(2) = 0.3, P(3) = 0.1 > > This means that certain permuations of the input are more likely than > others, which fails to meet the requirements of the function. > > Or is there a flaw in my reasoning? Just that if __swap_range is 3, then __comp_range is 3*(3+1)=12, not 10. :) Thanks for the review! I'll send an updated patch addressing the other issues soon.
On 01/09/16 17:27 +0200, Marc Glisse wrote: >On Thu, 1 Sep 2016, Jonathan Wakely wrote: > >>>>+ const __uc_type __comp_range = __swap_range * (__swap_range + 1); >> >>If __swap_range is 3, then __comp_range is 10 and > >??? Bah :-) Thanks. I guess I read the code correctly the other day at least!
On 01/09/16 17:31 +0200, Eelis van der Weegen wrote: >On 2016-09-01 17:14, Jonathan Wakely wrote: >>On 31/08/16 13:45 +0100, Jonathan Wakely wrote: >>>On 03/05/16 16:42 +0200, Eelis van der Weegen wrote: >>>>Ah, thanks, I forgot to re-attach when I sent to include the libstdc++ list. >>>> >>>>On 2016-05-03 14:38, Jonathan Wakely wrote: >>>>>ENOPATCH >>>>> >>>>>On 1 May 2016 at 15:21, Eelis <eelis@eelis.net> wrote: >>>>>>Sorry, forgot to include the libstdc++ list. >>>>>> >>>>>>On 2016-05-01 16:18, Eelis wrote: >>>>>>> >>>>>>>Hi, >>>>>>> >>>>>>>The attached patch optimizes std::shuffle for the very common case >>>>>>>where the generator range is large enough that a single invocation >>>>>>>can produce two swap positions. >>>>>>> >>>>>>>This reduces the runtime of the following testcase by 37% on my machine: >>>>>>> >>>>>>> int main() >>>>>>> { >>>>>>> std::mt19937 gen; >>>>>>> >>>>>>> std::vector<int> v; >>>>>>> v.reserve(10000); >>>>>>> for (int i = 0; i != 10000; ++i) >>>>>>> { >>>>>>> v.push_back(i); >>>>>>> std::shuffle(v.begin(), v.end(), gen); >>>>>>> } >>>>>>> >>>>>>> std::cout << v.front() << '\n'; >>>>>>> } >>>>>>> >>>>>>>Thoughts? >>>>>>> >>>>>>>Thanks, >>>>>>> >>>>>>>Eelis >>>>>>> >>>>>> >>>>>> >>>> >>> >>>>Index: libstdc++-v3/include/bits/stl_algo.h >>>>=================================================================== >>>>--- libstdc++-v3/include/bits/stl_algo.h (revision 235680) >>>>+++ libstdc++-v3/include/bits/stl_algo.h (working copy) >>>>@@ -3708,6 +3708,22 @@ >>>>#endif >>>> >>>>#ifdef _GLIBCXX_USE_C99_STDINT_TR1 >>>>+ >>>>+ template<typename _IntType, typename _UniformRandomNumberGenerator> >>> >>>We should avoid introducing new names based on "uniform random number >>>generator" and use _UniformRandomBitGenerator as per >>>https://wg21.link/p0346r1 >>> >>>>+ inline _IntType >>>>+ __generate_random_index_below(_IntType __bound, _UniformRandomNumberGenerator& __g) >>>>+ { >>>>+ const _IntType __urngrange = __g.max() - __g.min() + 1; >>> >>>Similarly, let's use __urbgrange here. >>> >>>>+ const _IntType __scaling = __urngrange / __bound; >>> >>>I think I'd like either a comment on the function documenting the >>>assumption about __bound and __g, or an explicit check: >>> >>> __glibcxx_assert( __scaling >= __bound ); >>> >>>>+ const _IntType __past = __bound * __scaling; >>>>+ >>>>+ for (;;) >>>>+ { >>>>+ const _IntType __r = _IntType(__g()) - __g.min(); >>>>+ if (__r < __past) return __r / __scaling; >>> >>>This is basically the same algorithm as uniform_int_distribution so >>>doesn't introduce any bias, right? >>> >>>Is this significantly faster than just using >>>uniform_int_distribution<_IntType>{0, __bound - 1}(__g) so we don't >>>need to duplicate the logic? (And people maintaining the code won't >>>reconvince themselves it's correct every time they look at it :-) >>> >>> >>>>+ } >>>>+ } >>>>+ >>>> /** >>>> * @brief Shuffle the elements of a sequence using a uniform random >>>> * number generator. >>>>@@ -3740,6 +3756,40 @@ >>>> typedef typename std::make_unsigned<_DistanceType>::type __ud_type; >>>> typedef typename std::uniform_int_distribution<__ud_type> __distr_type; >>>> typedef typename __distr_type::param_type __p_type; >>>>+ >>>>+ typedef typename std::remove_reference<_UniformRandomNumberGenerator>::type _Gen; >>>>+ typedef typename std::common_type<typename _Gen::result_type, __ud_type>::type __uc_type; >>>>+ >>>>+ const __uc_type __urngrange = _Gen::max() - _Gen::min() + 1; >>>>+ const __uc_type __urange = __uc_type(__last - __first); >>>>+ >>>>+ if (__urngrange / __urange >= __urange) >>>>+ // I.e. (__urngrange >= __urange * __urange) but without wrap issues. >>>>+ { >>>>+ for (_RandomAccessIterator __i = __first + 1; __i != __last; ) >>>>+ { >>>>+ const __uc_type __swap_range = __uc_type(__i - __first) + 1; >>>>+ >>>>+ if (__i + 1 == __last) >>> >>>Could we hoist this test out of the loop somehow? >>> >>>If we change the loop condition to be __i+1 < __last we don't need to >>>test it on every iteration, and then after the loop we can just do >>>the final swap if (__urange % 2). >>> >>>>+ { >>>>+ const __uc_type __pos = __generate_random_index_below(__swap_range, __g); >>>>+ std::iter_swap(__i, __first + __pos); >>>>+ return; >>>>+ } >>>>+ >>>>+ // Use a single generator invocation to produce swap positions for >>>>+ // both of the next two elements: >>>>+ >>>>+ const __uc_type __comp_range = __swap_range * (__swap_range + 1); >>>>+ const __uc_type __pospos = __generate_random_index_below(__comp_range, __g); >>>>+ >>>>+ std::iter_swap(__i++, __first + (__pospos % __swap_range)); >>>>+ std::iter_swap(__i++, __first + (__pospos / __swap_range)); >>> >>>I think I've convinced myself this is correct :-) >>> >>>Values of __pospos will be uniformly distributed in [0, __comp_range) >> >>iThis is true, but ... >> >>>and so the / and % results will be too. >> >>This isn't. >> >>If __swap_range is 3, then __comp_range is 10 and >>__pospos is uniformly distributed in [0, 9]. >> >>(__pospos % __swap_range) is not uniformly distributed, we get >>P(0) = 0.4, P(1) = 0.3, P(2) = 0.3. >> >>Similarly, (__pospos / __swap_range) is not uniform, we get >>P(0) = 0.3, P(1) = 0.3, P(2) = 0.3, P(3) = 0.1 >> >>This means that certain permuations of the input are more likely than >>others, which fails to meet the requirements of the function. >> >>Or is there a flaw in my reasoning? > >Just that if __swap_range is 3, then __comp_range is 3*(3+1)=12, not 10. :) A minor detail! >Thanks for the review! I'll send an updated patch addressing the other issues soon. I think my earlier comments are still sane, just ignore today's hasty email.
Index: libstdc++-v3/include/bits/stl_algo.h =================================================================== --- libstdc++-v3/include/bits/stl_algo.h (revision 235680) +++ libstdc++-v3/include/bits/stl_algo.h (working copy) @@ -3708,6 +3708,22 @@ #endif #ifdef _GLIBCXX_USE_C99_STDINT_TR1 + + template<typename _IntType, typename _UniformRandomNumberGenerator> + inline _IntType + __generate_random_index_below(_IntType __bound, _UniformRandomNumberGenerator& __g) + { + const _IntType __urngrange = __g.max() - __g.min() + 1; + const _IntType __scaling = __urngrange / __bound; + const _IntType __past = __bound * __scaling; + + for (;;) + { + const _IntType __r = _IntType(__g()) - __g.min(); + if (__r < __past) return __r / __scaling; + } + } + /** * @brief Shuffle the elements of a sequence using a uniform random * number generator. @@ -3740,6 +3756,40 @@ typedef typename std::make_unsigned<_DistanceType>::type __ud_type; typedef typename std::uniform_int_distribution<__ud_type> __distr_type; typedef typename __distr_type::param_type __p_type; + + typedef typename std::remove_reference<_UniformRandomNumberGenerator>::type _Gen; + typedef typename std::common_type<typename _Gen::result_type, __ud_type>::type __uc_type; + + const __uc_type __urngrange = _Gen::max() - _Gen::min() + 1; + const __uc_type __urange = __uc_type(__last - __first); + + if (__urngrange / __urange >= __urange) + // I.e. (__urngrange >= __urange * __urange) but without wrap issues. + { + for (_RandomAccessIterator __i = __first + 1; __i != __last; ) + { + const __uc_type __swap_range = __uc_type(__i - __first) + 1; + + if (__i + 1 == __last) + { + const __uc_type __pos = __generate_random_index_below(__swap_range, __g); + std::iter_swap(__i, __first + __pos); + return; + } + + // Use a single generator invocation to produce swap positions for + // both of the next two elements: + + const __uc_type __comp_range = __swap_range * (__swap_range + 1); + const __uc_type __pospos = __generate_random_index_below(__comp_range, __g); + + std::iter_swap(__i++, __first + (__pospos % __swap_range)); + std::iter_swap(__i++, __first + (__pospos / __swap_range)); + } + + return; + } + __distr_type __d; for (_RandomAccessIterator __i = __first + 1; __i != __last; ++__i)