Message ID | 20210726120055.1089971-1-hjl.tools@gmail.com
---|---
State | New
Series | x86-64: Add Avoid_Short_Distance_REP_MOVSB
On Mon, Jul 26, 2021 at 8:02 AM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote:
> commit 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5
> Author: H.J. Lu <hjl.tools@gmail.com>
> Date:   Sat Jan 25 14:19:40 2020 -0800
>
>     x86-64: Avoid rep movsb with short distance [BZ #27130]
>
> introduced some regressions on Intel processors without Fast Short REP
> MOV (FSRM).  Add Avoid_Short_Distance_REP_MOVSB to avoid rep movsb with
> short distance only on Intel processors with FSRM.  bench-memmove-large
> on Skylake server shows that cycles of __memmove_evex_unaligned_erms are
> improved for the following data sizes:
>
>                                     before    after   Improvement
> length=4127, align1=3, align2=0:    479.38   343.00       28%
> length=4223, align1=9, align2=5:    405.62   335.50       17%
> length=8223, align1=3, align2=0:    786.12   495.00       37%
> length=8319, align1=9, align2=5:    256.69   170.38       33%
> length=16415, align1=3, align2=0:  1436.88   839.50       41%
> length=16511, align1=9, align2=5:  1375.50   840.62       39%
> length=32799, align1=3, align2=0:  2890.00  1850.62       36%
> length=32895, align1=9, align2=5:  2891.38  1948.62       32%
>
> There are no regressions on Ice Lake server.

On Tigerlake I see some strange results for the random tests at length
4096.  Timings per ifunc (lower is faster):

    ifunc                             without patch   with patch
    __memcpy_avx_unaligned                   117793       136386
    __memcpy_avx_unaligned_erms              118814      95256.7
    __memcpy_evex_unaligned                 95009.2       134947
    __memcpy_evex_unaligned_erms             140061       102466
    __memcpy_ssse3_back                      209016       182687
    __memcpy_ssse3                           162007       163942
    __memcpy_avx512_no_vzeroupper            112210       110546
    __memcpy_avx512_unaligned                113011       127766
    __memcpy_avx512_unaligned_erms           139953      98344.5
    __memcpy_sse2_unaligned                  106604       107647
    __memcpy_sse2_unaligned_erms             106483       109190
    __memcpy_erms                            116845       118613

It seems like some of the erms versions are heavily pessimized while the
non-erms versions benefit significantly.  I think it has to do with the
change in alignment of L(less_vec), though I am not certain.

Are you seeing the same performance changes on Skylake/Icelake server?

> [quoted patch snipped; the full diff appears at the end of this page]
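As background for the discussion that follows, here is a minimal C model of the check the patch gates (a sketch based on a reading of the assembly, not glibc code): the assembly computes the source/destination distance in RCX (dst - src on one path, src - dst on the other, depending on copy direction) and compares only ECX, so the rejected distances are of the form N*4GB + [1..63] with N >= 0.

    /* Sketch: nonzero when "rep movsb" should be avoided for this
       dst/src pair on an FSRM machine.  Mirrors the "cmpl $63, %ecx"
       test, which only sees the low 32 bits of the distance.  */
    #include <stdint.h>

    static int
    short_distance_rep_movsb (const void *dst, const void *src)
    {
      uint32_t dist = (uint32_t) ((uintptr_t) dst - (uintptr_t) src);
      return dist <= 63;
    }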
On Mon, Jul 26, 2021 at 10:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Mon, Jul 26, 2021 at 8:02 AM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote:
>>
>> [commit message and benchmark numbers snipped; quoted in full above]
>
> On Tigerlake I see some strange results for the random tests:
>
> [benchmark data snipped; quoted in full above]
>
> It seems like some of the erms versions are heavily pessimized while the
> non-erms versions benefit significantly.  I think it has to do with the
> change in alignment of L(less_vec), though I am not certain.

I also saw it on Tiger Lake.  Please try this patch on top of my patch.

> Are you seeing the same performance changes on Skylake/Icelake server?

I will check it out.

>> [quoted patch snipped]
On 7/26/21 8:00 AM, H.J. Lu via Libc-alpha wrote:
> commit 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5
> Author: H.J. Lu <hjl.tools@gmail.com>
> Date:   Sat Jan 25 14:19:40 2020 -0800
>
>     x86-64: Avoid rep movsb with short distance [BZ #27130]
>
> [commit message and benchmark numbers snipped; quoted in full above]

At this point we're waiting on Noah to provide feedback on the performance
results given the alignment nop insertion you provided as a follow-up patch
(unless you can confirm this yourself).

Looking forward to a v2 that incorporates the alignment fix (pending Noah's
comments), and my suggestions below.

> ---
>  sysdeps/x86/cacheinfo.h                                     | 7 +++++++
>  sysdeps/x86/cpu-features.c                                  | 5 +++++
>  .../x86/include/cpu-features-preferred_feature_index_1.def  | 1 +
>  sysdeps/x86/sysdep.h                                        | 3 +++
>  sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S       | 5 +++++
>  5 files changed, 21 insertions(+)
>
> diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
> index eba8dbc4a6..174ea38f5b 100644
> --- a/sysdeps/x86/cacheinfo.h
> +++ b/sysdeps/x86/cacheinfo.h
> @@ -49,6 +49,9 @@ long int __x86_rep_stosb_threshold attribute_hidden = 2048;
>  /* Threshold to stop using Enhanced REP MOVSB.  */
>  long int __x86_rep_movsb_stop_threshold attribute_hidden;
>
> +/* String/memory function control.  */
> +int __x86_string_control attribute_hidden;

Please expand the comment.  Suggest:

/* A bit-wise OR of string/memory requirements for optimal performance
   e.g. X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB.  These bits
   are used at runtime to tune implementation behavior.  */
int __x86_string_control attribute_hidden;

> +
>  static void
>  init_cacheinfo (void)
>  {
> @@ -71,5 +74,9 @@ init_cacheinfo (void)
>    __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
>    __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
>    __x86_rep_movsb_stop_threshold = cpu_features->rep_movsb_stop_threshold;
> +
> +  if (CPU_FEATURES_ARCH_P (cpu_features, Avoid_Short_Distance_REP_MOVSB))
> +    __x86_string_control
> +      |= X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB;

OK.

>  }
>  #endif
> diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> index 706a172ba9..645bba6314 100644
> --- a/sysdeps/x86/cpu-features.c
> +++ b/sysdeps/x86/cpu-features.c
> @@ -555,6 +555,11 @@ init_cpu_features (struct cpu_features *cpu_features)
>  	      cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
>  		|= bit_arch_Prefer_AVX2_STRCMP;
>  	    }
> +
> +	  /* Avoid avoid short distance REP MOVSB on processor with FSRM.  */
> +	  if (CPU_FEATURES_CPU_P (cpu_features, FSRM))
> +	    cpu_features->preferred[index_arch_Avoid_Short_Distance_REP_MOVSB]
> +	      |= bit_arch_Avoid_Short_Distance_REP_MOVSB;

OK.

>  	}
>        /* This spells out "AuthenticAMD" or "HygonGenuine".  */
>        else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
> diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> index 133aab19f1..d7c93f00c5 100644
> --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
>  BIT (MathVec_Prefer_No_AVX512)
>  BIT (Prefer_FSRM)
>  BIT (Prefer_AVX2_STRCMP)
> +BIT (Avoid_Short_Distance_REP_MOVSB)

OK.

> diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
> index 51c069bfe1..35cb90d507 100644
> --- a/sysdeps/x86/sysdep.h
> +++ b/sysdeps/x86/sysdep.h
> @@ -57,6 +57,9 @@ enum cf_protection_level
>  #define STATE_SAVE_MASK \
>    ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
>

Suggest adding:

/* Constants for bits in __x86_string_control:  */

> +/* Avoid short distance REP MOVSB.  */
> +#define X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB (1 << 0)

OK.

> +
>  #ifdef __ASSEMBLER__
>
>  /* Syntactic details of assembler.  */
> diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> index a783da5de2..9f02624375 100644
> --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> @@ -325,12 +325,16 @@ L(movsb):
>  	/* Avoid slow backward REP MOVSB.  */
>  	jb	L(more_8x_vec_backward)
> # if AVOID_SHORT_DISTANCE_REP_MOVSB
> +	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> +	jz	3f

OK.

>  	movq	%rdi, %rcx
>  	subq	%rsi, %rcx
>  	jmp	2f
> # endif
> 1:
> # if AVOID_SHORT_DISTANCE_REP_MOVSB
> +	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> +	jz	3f

OK.

>  	movq	%rsi, %rcx
>  	subq	%rdi, %rcx
> 2:
> @@ -338,6 +342,7 @@ L(movsb):
>  	   is N*4GB + [1..63] with N >= 0.  */
>  	cmpl	$63, %ecx
>  	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
> +3:

OK.

> # endif
>  	mov	%RDX_LP, %RCX_LP
>  	rep movsb
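Putting the two suggestions above together, the v2 declarations would presumably read as follows (wording lifted from this review; a sketch of the expected v2, not a committed patch):

    /* In sysdeps/x86/sysdep.h: constants for bits in __x86_string_control.  */

    /* Avoid short distance REP MOVSB.  */
    #define X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB (1 << 0)

    /* In sysdeps/x86/cacheinfo.h.  */

    /* A bit-wise OR of string/memory requirements for optimal performance
       e.g. X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB.  These bits
       are used at runtime to tune implementation behavior.  */
    int __x86_string_control attribute_hidden;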
On Mon, Jul 26, 2021 at 7:15 PM Carlos O'Donell <carlos@redhat.com> wrote:
>
> On 7/26/21 8:00 AM, H.J. Lu via Libc-alpha wrote:
> > [commit message and benchmark numbers snipped; quoted in full above]
>
> At this point we're waiting on Noah to provide feedback on the performance
> results given the alignment nop insertion you provided as a follow-up patch

We are testing 25 byte nop padding now:
https://gitlab.com/x86-glibc/glibc/-/commit/de8985640a568786a59576716db54e0749d420e8

> (unless you can confirm this yourself).
>
> Looking forward to a v2 that incorporates the alignment fix (pending Noah's
> comments), and my suggestions below.
>
> [review of the diffstat and earlier hunks snipped; quoted in full above]
>
> > +/* String/memory function control.  */
> > +int __x86_string_control attribute_hidden;
>
> Please expand the comment.  Suggest:
>
> /* A bit-wise OR of string/memory requirements for optimal performance
>    e.g. X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB.  These bits
>    are used at runtime to tune implementation behavior.  */
> int __x86_string_control attribute_hidden;

I will fix it in the v2 patch.  Thanks.

> [remainder of the review snipped; quoted in full above]
>
> --
> Cheers,
> Carlos.
On Mon, Jul 26, 2021 at 11:11 PM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote:
> On Mon, Jul 26, 2021 at 7:15 PM Carlos O'Donell <carlos@redhat.com> wrote:
> >
> > [review context snipped; quoted in full above]
> >
> > At this point we're waiting on Noah to provide feedback on the performance
> > results given the alignment nop insertion you provided as a follow-up patch

The results with the padding look good!

> We are testing 25 byte nop padding now:
> https://gitlab.com/x86-glibc/glibc/-/commit/de8985640a568786a59576716db54e0749d420e8

How did you come to the exact padding choice used?

> [remainder snipped; quoted in full above]
>
> --
> H.J.
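For context on the padding question: the two inserted andl/jz sequences grow the code ahead of later labels, which can knock a hot target such as L(less_vec) off the alignment it previously had. A sketch of the compensation technique follows (the 25-byte figure comes from the commit linked above; the exact placement there is an assumption here, and GAS's x86 .nops directive needs a reasonably recent binutils):

    # if AVOID_SHORT_DISTANCE_REP_MOVSB
    	/* Emit 25 bytes of multi-byte nops so that the code that
    	   follows keeps the alignment it had before the andl/jz
    	   sequences were added.  */
    	.nops	25
    # endif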
On Mon, Jul 26, 2021 at 3:03 PM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote:
> --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> @@ -325,12 +325,16 @@ L(movsb):
>  	/* Avoid slow backward REP MOVSB.  */
>  	jb	L(more_8x_vec_backward)
> # if AVOID_SHORT_DISTANCE_REP_MOVSB
> +	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> +	jz	3f
>  	movq	%rdi, %rcx
>  	subq	%rsi, %rcx
>  	jmp	2f
> # endif
> 1:
> # if AVOID_SHORT_DISTANCE_REP_MOVSB
> +	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> +	jz	3f
>  	movq	%rsi, %rcx
>  	subq	%rdi, %rcx
> 2:

Why "andl" rather than "testl"?
On Fri, Aug 27, 2021 at 8:28 PM Alexey Tourbin via Libc-alpha <libc-alpha@sourceware.org> wrote:
> On Mon, Jul 26, 2021 at 3:03 PM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote:
> > [quoted asm hunk snipped; see the message above]
>
> Why "andl" rather than "testl"?

+1.  I missed that before.

My patch "[PATCH 5/5] X86-64: Optimize memmove-vec-unaligned-erms.S" uses
`testl` instead, although it might not be checked in for a while, as we are
still looking into the causes of the slowdown.
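To make the difference concrete: testl computes the same AND to set the flags but never writes the result back, while andl with a memory destination is a read-modify-write: a needless store on every call, and one that would silently clear any bits later added to __x86_string_control beyond bit 0. A sketch of the substitution, using the same operands as the hunk quoted above:

    # if AVOID_SHORT_DISTANCE_REP_MOVSB
    	/* testl sets ZF from (__x86_string_control & bit) without
    	   storing the result, so the variable is only ever read.  */
    	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
    	jz	3f
    # endif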
On Fri, Aug 27, 2021 at 7:58 PM Noah Goldstein via Libc-alpha <libc-alpha@sourceware.org> wrote:
>
> On Fri, Aug 27, 2021 at 8:28 PM Alexey Tourbin via Libc-alpha <libc-alpha@sourceware.org> wrote:
> > [quoted asm hunk snipped; see the messages above]
> >
> > Why "andl" rather than "testl"?
>
> +1.  I missed that before.
>
> [remainder snipped; quoted in full above]

I would like to backport this patch to release branches.  Any comments or
objections?

--Sunil
diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
index eba8dbc4a6..174ea38f5b 100644
--- a/sysdeps/x86/cacheinfo.h
+++ b/sysdeps/x86/cacheinfo.h
@@ -49,6 +49,9 @@ long int __x86_rep_stosb_threshold attribute_hidden = 2048;
 /* Threshold to stop using Enhanced REP MOVSB.  */
 long int __x86_rep_movsb_stop_threshold attribute_hidden;
 
+/* String/memory function control.  */
+int __x86_string_control attribute_hidden;
+
 static void
 init_cacheinfo (void)
 {
@@ -71,5 +74,9 @@ init_cacheinfo (void)
   __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
   __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
   __x86_rep_movsb_stop_threshold = cpu_features->rep_movsb_stop_threshold;
+
+  if (CPU_FEATURES_ARCH_P (cpu_features, Avoid_Short_Distance_REP_MOVSB))
+    __x86_string_control
+      |= X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB;
 }
 #endif
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 706a172ba9..645bba6314 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -555,6 +555,11 @@ init_cpu_features (struct cpu_features *cpu_features)
 	      cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
 		|= bit_arch_Prefer_AVX2_STRCMP;
 	    }
+
+	  /* Avoid short distance REP MOVSB on processor with FSRM.  */
+	  if (CPU_FEATURES_CPU_P (cpu_features, FSRM))
+	    cpu_features->preferred[index_arch_Avoid_Short_Distance_REP_MOVSB]
+	      |= bit_arch_Avoid_Short_Distance_REP_MOVSB;
 	}
       /* This spells out "AuthenticAMD" or "HygonGenuine".  */
       else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
index 133aab19f1..d7c93f00c5 100644
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
 BIT (MathVec_Prefer_No_AVX512)
 BIT (Prefer_FSRM)
 BIT (Prefer_AVX2_STRCMP)
+BIT (Avoid_Short_Distance_REP_MOVSB)
diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
index 51c069bfe1..35cb90d507 100644
--- a/sysdeps/x86/sysdep.h
+++ b/sysdeps/x86/sysdep.h
@@ -57,6 +57,9 @@ enum cf_protection_level
 #define STATE_SAVE_MASK \
   ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
 
+/* Avoid short distance REP MOVSB.  */
+#define X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB (1 << 0)
+
 #ifdef __ASSEMBLER__
 
 /* Syntactic details of assembler.  */
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index a783da5de2..9f02624375 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -325,12 +325,16 @@ L(movsb):
 	/* Avoid slow backward REP MOVSB.  */
 	jb	L(more_8x_vec_backward)
 # if AVOID_SHORT_DISTANCE_REP_MOVSB
+	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+	jz	3f
 	movq	%rdi, %rcx
 	subq	%rsi, %rcx
 	jmp	2f
 # endif
 1:
 # if AVOID_SHORT_DISTANCE_REP_MOVSB
+	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+	jz	3f
 	movq	%rsi, %rcx
 	subq	%rdi, %rcx
 2:
@@ -338,6 +342,7 @@ L(movsb):
 	   is N*4GB + [1..63] with N >= 0.  */
 	cmpl	$63, %ecx
 	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
+3:
 # endif
 	mov	%RDX_LP, %RCX_LP
 	rep movsb