Message ID | 20240509090951.1336223-1-admin@levyhsu.com |
---|---|
State | New |
Headers | show |
Series | [1/1] x86:Add 3-instruction subroutine vector shift for V16QI in ix86_expand_vec_perm_const_1 [PR107563] | expand |
On Thu, May 9, 2024 at 11:12 AM Levy Hsu <admin@levyhsu.com> wrote: > > Hi All > > We've introduced a new subroutine in ix86_expand_vec_perm_const_1 > to optimize vector shifting for the V16QI type on x86. > This patch uses a three-instruction sequence psrlw, psllw, and por > to handle specific vector shuffle operations more efficiently. > The change aims to improve assembly code generation for configurations > supporting SSE2. > > Bootstrapped and tested on x86_64-linux-gnu, OK for trunk? > > Best > Levy > > gcc/ChangeLog: > > PR target/107563 > * config/i386/i386-expand.cc (expand_vec_perm_psrlw_psllw_por): New > subroutine. > (ix86_expand_vec_perm_const_1): New Entry. Please say (ix86_expand_vec_perm_const_1): Call expand_vec_perm_psrlw_psllw_por. > > gcc/testsuite/ChangeLog: > > PR target/107563 > * g++.target/i386/pr107563-a.C: New test. > * g++.target/i386/pr107563-b.C: New test. OK with the above adjustment. Thanks, Uros. > --- > gcc/config/i386/i386-expand.cc | 64 ++++++++++++++++++++++ > gcc/testsuite/g++.target/i386/pr107563-a.C | 13 +++++ > gcc/testsuite/g++.target/i386/pr107563-b.C | 12 ++++ > 3 files changed, 89 insertions(+) > create mode 100755 gcc/testsuite/g++.target/i386/pr107563-a.C > create mode 100755 gcc/testsuite/g++.target/i386/pr107563-b.C > > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc > index 2f27bfb484c..5098d2886bb 100644 > --- a/gcc/config/i386/i386-expand.cc > +++ b/gcc/config/i386/i386-expand.cc > @@ -22362,6 +22362,67 @@ expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn) > return true; > } > > +/* A subroutine of ix86_expand_vec_perm_const_1. > + Implement a permutation with psrlw, psllw and por. 
> + It handles case: > + __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14); > + __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6); */ > + > +static bool > +expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d) > +{ > + unsigned i; > + rtx (*gen_shr) (rtx, rtx, rtx); > + rtx (*gen_shl) (rtx, rtx, rtx); > + rtx (*gen_or) (rtx, rtx, rtx); > + machine_mode mode = VOIDmode; > + > + if (!TARGET_SSE2 || !d->one_operand_p) > + return false; > + > + switch (d->vmode) > + { > + case E_V8QImode: > + if (!TARGET_MMX_WITH_SSE) > + return false; > + mode = V4HImode; > + gen_shr = gen_ashrv4hi3; > + gen_shl = gen_ashlv4hi3; > + gen_or = gen_iorv4hi3; > + break; > + case E_V16QImode: > + mode = V8HImode; > + gen_shr = gen_vlshrv8hi3; > + gen_shl = gen_vashlv8hi3; > + gen_or = gen_iorv8hi3; > + break; > + default: return false; > + } > + > + if (!rtx_equal_p (d->op0, d->op1)) > + return false; > + > + for (i = 0; i < d->nelt; i += 2) > + if (d->perm[i] != i + 1 || d->perm[i + 1] != i) > + return false; > + > + if (d->testing_p) > + return true; > + > + rtx tmp1 = gen_reg_rtx (mode); > + rtx tmp2 = gen_reg_rtx (mode); > + rtx op0 = force_reg (d->vmode, d->op0); > + > + emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode)); > + emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode)); > + emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8))); > + emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8))); > + emit_insn (gen_or (tmp1, tmp1, tmp2)); > + emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode)); > + > + return true; > +} > + > /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF > permutation using two vperm2f128, followed by a vshufpd insn blending > the two vectors together. */ > @@ -23782,6 +23843,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) > if (expand_vec_perm_2perm_pblendv (d, false)) > return true; > > + if (expand_vec_perm_psrlw_psllw_por (d)) > + return true; > + > /* Try sequences of four instructions. 
*/ > > if (expand_vec_perm_even_odd_trunc (d)) > diff --git a/gcc/testsuite/g++.target/i386/pr107563-a.C b/gcc/testsuite/g++.target/i386/pr107563-a.C > new file mode 100755 > index 00000000000..605c1bdf814 > --- /dev/null > +++ b/gcc/testsuite/g++.target/i386/pr107563-a.C > @@ -0,0 +1,13 @@ > +/* PR target/107563.C */ > +/* { dg-do compile { target { ! ia32 } } } */ > +/* { dg-options "-std=c++2b -O3 -msse2" } */ > +/* { dg-final { scan-assembler-times "psllw" 1 } } */ > +/* { dg-final { scan-assembler-times "psraw" 1 } } */ > +/* { dg-final { scan-assembler-times "por" 1 } } */ > + > +using temp_vec_type2 [[__gnu__::__vector_size__(8)]] = char; > + > +void foo2(temp_vec_type2& v) noexcept > +{ > + v = __builtin_shufflevector(v, v, 1, 0, 3, 2, 5, 4, 7, 6); > +} > diff --git a/gcc/testsuite/g++.target/i386/pr107563-b.C b/gcc/testsuite/g++.target/i386/pr107563-b.C > new file mode 100755 > index 00000000000..0ce3e8263bb > --- /dev/null > +++ b/gcc/testsuite/g++.target/i386/pr107563-b.C > @@ -0,0 +1,12 @@ > +/* PR target/107563.C */ > +/* { dg-options "-std=c++2b -O3 -msse2" } */ > +/* { dg-final { scan-assembler-times "psllw" 1 } } */ > +/* { dg-final { scan-assembler-times "psrlw" 1 } } */ > +/* { dg-final { scan-assembler-times "por" 1 } } */ > + > +using temp_vec_type [[__gnu__::__vector_size__(16)]] = char; > + > +void foo(temp_vec_type& v) noexcept > +{ > + v = __builtin_shufflevector(v, v, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); > +} > -- > 2.31.1 >
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 2f27bfb484c..5098d2886bb 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -22362,6 +22362,73 @@ expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
   return true;
 }
 
+/* A subroutine of ix86_expand_vec_perm_const_1.
+   Implement a permutation with psrlw, psllw and por.
+   It handles the one-operand swap of the two bytes in each 16-bit word:
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6);
+   The right shift must be logical: an arithmetic shift would replicate
+   the sign bit into the high byte and corrupt the result of the por.  */
+
+static bool
+expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d)
+{
+  unsigned i;
+  rtx (*gen_shr) (rtx, rtx, rtx);
+  rtx (*gen_shl) (rtx, rtx, rtx);
+  rtx (*gen_or) (rtx, rtx, rtx);
+  machine_mode mode = VOIDmode;
+
+  if (!TARGET_SSE2 || !d->one_operand_p)
+    return false;
+
+  /* The shift count below is an immediate, so use the shift-by-scalar
+     patterns rather than the per-element vlshr/vashl ones.  */
+  switch (d->vmode)
+    {
+    case E_V8QImode:
+      if (!TARGET_MMX_WITH_SSE)
+        return false;
+      mode = V4HImode;
+      gen_shr = gen_lshrv4hi3;
+      gen_shl = gen_ashlv4hi3;
+      gen_or = gen_iorv4hi3;
+      break;
+    case E_V16QImode:
+      mode = V8HImode;
+      gen_shr = gen_lshrv8hi3;
+      gen_shl = gen_ashlv8hi3;
+      gen_or = gen_iorv8hi3;
+      break;
+    default: return false;
+    }
+
+  if (!rtx_equal_p (d->op0, d->op1))
+    return false;
+
+  /* Only accept the permutation that swaps each adjacent byte pair.  */
+  for (i = 0; i < d->nelt; i += 2)
+    if (d->perm[i] != i + 1 || d->perm[i + 1] != i)
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  rtx tmp1 = gen_reg_rtx (mode);
+  rtx tmp2 = gen_reg_rtx (mode);
+  rtx op0 = force_reg (d->vmode, d->op0);
+
+  /* (x >> 8) | (x << 8) on each HImode element swaps its two bytes.  */
+  emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode));
+  emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode));
+  emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8)));
+  emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8)));
+  emit_insn (gen_or (tmp1, tmp1, tmp2));
+  emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
    permutation using two vperm2f128, followed by a vshufpd insn blending
    the two vectors together.  */
@@ -23782,6 +23849,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
   if (expand_vec_perm_2perm_pblendv (d, false))
     return true;
 
+  if (expand_vec_perm_psrlw_psllw_por (d))
+    return true;
+
   /* Try sequences of four instructions.  */
 
   if (expand_vec_perm_even_odd_trunc (d))
diff --git a/gcc/testsuite/g++.target/i386/pr107563-a.C b/gcc/testsuite/g++.target/i386/pr107563-a.C
new file mode 100755
index 00000000000..605c1bdf814
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr107563-a.C
@@ -0,0 +1,13 @@
+/* PR target/107563.C */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-std=c++2b -O3 -msse2" } */
+/* { dg-final { scan-assembler-times "psllw" 1 } } */
+/* { dg-final { scan-assembler-times "psrlw" 1 } } */
+/* { dg-final { scan-assembler-times "por" 1 } } */
+
+using temp_vec_type2 [[__gnu__::__vector_size__(8)]] = char;
+
+void foo2(temp_vec_type2& v) noexcept
+{
+  v = __builtin_shufflevector(v, v, 1, 0, 3, 2, 5, 4, 7, 6);
+}
diff --git a/gcc/testsuite/g++.target/i386/pr107563-b.C b/gcc/testsuite/g++.target/i386/pr107563-b.C
new file mode 100755
index 00000000000..0ce3e8263bb
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr107563-b.C
@@ -0,0 +1,12 @@
+/* PR target/107563.C */
+/* { dg-options "-std=c++2b -O3 -msse2" } */
+/* { dg-final { scan-assembler-times "psllw" 1 } } */
+/* { dg-final { scan-assembler-times "psrlw" 1 } } */
+/* { dg-final { scan-assembler-times "por" 1 } } */
+
+using temp_vec_type [[__gnu__::__vector_size__(16)]] = char;
+
+void foo(temp_vec_type& v) noexcept
+{
+  v = __builtin_shufflevector(v, v, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+}