Message ID | Yn50gHFTBUjKlDuC@toto.the-meissners.org |
---|---|
State | New |
Headers | show |
Series | Delay splitting addti3/subti3 until first split pass. | expand |
On Fri, May 13, 2022 at 11:08:48AM -0400, Michael Meissner wrote:
> Add zero_extendditi2. Improve lxvr*x code generation.
<snip>
Nothing in this pass haas anything to do with the subject. Which is a
good thing, because not expanding addti3 etc. to RTL that mimics the
machine insns is not acceptable at all.
Segher
On Fri, 2022-05-13 at 11:08 -0400, Michael Meissner wrote: > Add zero_extendditi2. Improve lxvr*x code generation. > Hi, > Subject: Re: [PATCH] Delay splitting addti3/subti3 until first split pass. Subject does not seem to match contents? > This pattern adds zero_extendditi2 so that if we are extending DImode that > is in a GPR register to TImode in a vector register, the compiler can > generate MTVSRDDD. Just "mtvsrdd". > > In addition the patterns for generating lxvr{b,h,w,d}x were tuned to allow > loading to gpr registers. This prevents needlessly doing direct moves to > get the value into the vector registers if the gpr register was already > selected. > > In updating the insn counts for two tests due to these changes, I noticed > the tests were done at -O0. I changed this so that the tests are now done > at the normal -O2 optimization level. > s/normal/default/ ? > This patch will be needed for an upcoming patch for PR target/103109. > ok > I have built this patch on little endian power10, little endian power9, > and big endian power8 systems. There were no regressions with this > patch. Can I install this on the GCC 13 trunk? > > 2022-05-013 Michael Meissner <meissner@linux.ibm.com> > > gcc/ > * config/rs6000/vsx.md (vsx_lxvr<wd>x): Add support for loading to > GPR registers. > (vsx_stxvr<wd>x): Add support for storing from GPR registers. swap the froms and tos ? > (zero_extendditi2): New insn. > > gcc/testsuite/ > * gcc.target/powerpc/vsx-load-element-extend-int.c: Use -O2 > instead of -O0 and update insn counts. > * gcc.target/powerpc/vsx-load-element-extend-short.c: Likewise. > * gcc.target/powerpc/zero-extend-di-ti.c: New test. > --- > gcc/config/rs6000/vsx.md | 82 +++++++++++++++++-- > .../powerpc/vsx-load-element-extend-int.c | 36 ++++---- > .../powerpc/vsx-load-element-extend-short.c | 35 ++++---- > .../gcc.target/powerpc/zero-extend-di-ti.c | 62 ++++++++++++++ > 4 files changed, 164 insertions(+), 51 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c > > diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md > index c091e5e2f47..ad971e3a1de 100644 > --- a/gcc/config/rs6000/vsx.md > +++ b/gcc/config/rs6000/vsx.md > @@ -1315,14 +1315,32 @@ (define_expand "vsx_store_<mode>" > } > }) > > -;; Load rightmost element from load_data > -;; using lxvrbx, lxvrhx, lxvrwx, lxvrdx. > -(define_insn "vsx_lxvr<wd>x" > - [(set (match_operand:TI 0 "vsx_register_operand" "=wa") > - (zero_extend:TI (match_operand:INT_ISA3 1 "memory_operand" "Z")))] > - "TARGET_POWER10" > - "lxvr<wd>x %x0,%y1" > - [(set_attr "type" "vecload")]) > +;; Load rightmost element from load_data using lxvrbx, lxvrhx, lxvrwx, lxvrdx. > +;; Support TImode being in a GPR register to prevent generating lvxr{d,w,b}x > +;; and then two direct moves if we ultimately need the value in a GPR register. Perhaps break into two sentences and split the description of what is prevented in a separate sentence. ? > +(define_insn_and_split "vsx_lxvr<wd>x" > + [(set (match_operand:TI 0 "register_operand" "=r,wa") > + (zero_extend:TI (match_operand:INT_ISA3 1 "memory_operand" "m,Z")))] > + "TARGET_POWERPC64 && TARGET_POWER10" > + "@ > + # > + lxvr<wd>x %x0,%y1" > + "&& reload_completed && int_reg_operand (operands[0], TImode)" > + [(set (match_dup 2) (match_dup 3)) > + (set (match_dup 4) (const_int 0))] > +{ > + rtx op0 = operands[0]; > + rtx op1 = operands[1]; > + > + operands[2] = gen_lowpart (DImode, op0); > + operands[3] = (<MODE>mode == DImode > + ? op1 > + : gen_rtx_ZERO_EXTEND (DImode, op1)); > + > + operands[4] = gen_highpart (DImode, op0); > +} > + [(set_attr "type" "load,vecload") > + (set_attr "num_insns" "2,*")]) > > ;; Store rightmost element into store_data > ;; using stxvrbx, stxvrhx, strvxwx, strvxdx. > @@ -5019,6 +5037,54 @@ (define_expand "vsignextend_si_v2di" > DONE; > }) > > +;; Zero extend DI to TI. If we don't have the MTVSRDD instruction (and LXVRDX > +;; in the case of power10), we use the machine independent code. If we are > +;; loading up GPRs, we fall back to the old code. Will 'old code' have meaning to future readers of this lump of code? > +(define_insn_and_split "zero_extendditi2" > + [(set (match_operand:TI 0 "register_operand" "=r,r, wa,&wa") > + (zero_extend:TI (match_operand:DI 1 "register_operand" "r,wa,r, wa")))] > + "TARGET_POWERPC64 && TARGET_P9_VECTOR" > + "@ > + # > + # > + mtvsrdd %x0,0,%1 > + #" > + "&& reload_completed > + && (int_reg_operand (operands[0], TImode) > + || vsx_register_operand (operands[1], DImode))" > + [(pc)] > +{ > + rtx dest = operands[0]; > + rtx src = operands[1]; > + int dest_regno = reg_or_subregno (dest); > + > + /* Handle conversion to GPR registers. Load up the low part and then do > + zero out the upper part. */ s/do// > + if (INT_REGNO_P (dest_regno)) > + { > + rtx dest_hi = gen_highpart (DImode, dest); > + rtx dest_lo = gen_lowpart (DImode, dest); > + > + emit_move_insn (dest_lo, src); > + emit_move_insn (dest_hi, const0_rtx); > + DONE; > + } > + > + /* For settomg a VSX register from another VSX register, clear the result > + register, and use XXPERMDI to shift the value into the lower 64-bits. */ setting > + rtx dest_v2di = gen_rtx_REG (V2DImode, dest_regno); > + rtx dest_di = gen_rtx_REG (DImode, dest_regno); > + > + emit_move_insn (dest_v2di, CONST0_RTX (V2DImode)); > + if (BYTES_BIG_ENDIAN) > + emit_insn (gen_vsx_concat_v2di (dest_v2di, dest_di, src)); > + else > + emit_insn (gen_vsx_concat_v2di (dest_v2di, src, dest_di)); > + DONE; > +} > + [(set_attr "type" "integer,mfvsr,vecmove,vecperm") > + (set_attr "length" "8, 8, *, 8")]) > + > ;; Sign extend DI to TI. We provide both GPR targets and Altivec targets on > ;; power10. On earlier systems, the machine independent code will generate a > ;; shift left to sign extend the 64-bit value to 128-bit. > diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c > index c40e1a3a0f7..1f1281d6b75 100644 > --- a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c > +++ b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c > @@ -6,33 +6,25 @@ > /* { dg-do compile { target { ! power10_hw } } } */ > /* { dg-require-effective-target power10_ok } */ > /* { dg-require-effective-target int128 } */ > - > -/* Deliberately set optization to zero for this test to confirm > - the lxvr*x instruction is generated. At higher optimization levels > - the instruction we are looking for is sometimes replaced by other > - load instructions. */ > -/* { dg-options "-mdejagnu-cpu=power10 -O0 -save-temps" } */ > - > +/* { dg-options "-mdejagnu-cpu=power10 -O2 -save-temps" } */ ok > /* { dg-final { scan-assembler-times {\mlxvrwx\M} 2 } } */ > > #define NUM_VEC_ELEMS 4 > #define ITERS 16 > > -/* > -Codegen at time of writing is a single lxvrwx for the zero > -extended test, and a lwax,mtvsrdd,vextsd2q for the sign > -extended test. > - > -0000000010000c90 <test_sign_extended_load>: > - 10000c90: aa 1a 24 7d lwax r9,r4,r3 > - 10000c94: 67 4b 40 7c mtvsrdd vs34,0,r9 > - 10000c98: 02 16 5b 10 vextsd2q v2,v2 > - 10000c9c: 20 00 80 4e blr > - > -0000000010000cb0 <test_zero_extended_unsigned_load>: > - 10000cb0: 9b 18 44 7c lxvrwx vs34,r4,r3 > - 10000cb4: 20 00 80 4e blr > -*/ > +/* Codegen at time of writing is a single lxvrwx for the zero extended test, > + and a lxvrwx + vexts* sign extension instructions for the sign extended > + test. > + > + 0000000000000000 <test_sign_extended_load>: > + 0: 9b 18 44 7c lxvrwx vs34,r4,r3 > + 4: 02 16 5a 10 vextsw2d v2,v2 > + 8: 02 16 5b 10 vextsd2q v2,v2 > + c: 20 00 80 4e blr > + > + 0000000000000020 <test_zero_extended_unsigned_load>: > + 20: 9b 18 44 7c lxvrwx vs34,r4,r3 > + 24: 20 00 80 4e blr */ > > #include <altivec.h> > #include <stdio.h> > diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c > index 837ba79c9ab..a7721318812 100644 > --- a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c > +++ b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c > @@ -6,33 +6,26 @@ > /* { dg-do compile { target { ! power10_hw } } } */ > /* { dg-require-effective-target power10_ok } */ > /* { dg-require-effective-target int128 } */ > - > -/* Deliberately set optization to zero for this test to confirm > - the lxvr*x instruction is generated. At higher optimization levels > - the instruction we are looking for is sometimes replaced by other > - load instructions. */ > -/* { dg-options "-mdejagnu-cpu=power10 -O0 -save-temps" } */ > +/* { dg-options "-mdejagnu-cpu=power10 -O2 -save-temps" } */ > > /* { dg-final { scan-assembler-times {\mlxvrhx\M} 2 } } */ ok Nothing else jumped out at me from the below. thanks -Will > > #define NUM_VEC_ELEMS 8 > #define ITERS 16 > > -/* > -Codegen at time of writing uses lxvrhx for the zero > -extension test and lhax,mtvsrdd,vextsd2q for the > -sign extended test. > - > -0000000010001810 <test_sign_extended_load>: > - 10001810: ae 1a 24 7d lhax r9,r4,r3 > - 10001814: 67 4b 40 7c mtvsrdd vs34,0,r9 > - 10001818: 02 16 5b 10 vextsd2q v2,v2 > - 1000181c: 20 00 80 4e blr > - > -0000000010001830 <test_zero_extended_unsigned_load>: > - 10001830: 5b 18 44 7c lxvrhx vs34,r4,r3 > - 10001834: 20 00 80 4e blr > -*/ > +/* Codegen at time of writing is a single lxvrwx for the zero extended test, > + and a lxvrwx + vexts* sign extension instructions for the sign extended > + test. > + > + 0000000000000000 <test_sign_extended_load>: > + 0: 5b 18 44 7c lxvrhx vs34,r4,r3 > + 4: 02 16 59 10 vextsh2d v2,v2 > + 8: 02 16 5b 10 vextsd2q v2,v2 > + c: 20 00 80 4e blr > + > + 0000000000000020 <test_zero_extended_unsigned_load>: > + 20: 5b 18 44 7c lxvrhx vs34,r4,r3 > + 24: 20 00 80 4e blr */ > > #include <altivec.h> > #include <stdio.h> > diff --git a/gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c b/gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c > new file mode 100644 > index 00000000000..9b3b9c4dbd0 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c > @@ -0,0 +1,62 @@ > +/* { dg-require-effective-target int128 } */ > +/* { dg-require-effective-target power10_ok } */ > +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ > + > +/* This patch makes sure the various optimization and code paths are done for > + zero extending DImode to TImode on power10. */ > + > +__uint128_t > +gpr_to_gpr (unsigned long long a) > +{ > + /* li 4,0. */ > + return a; > +} > + > +__uint128_t > +mem_to_gpr (unsigned long long *p) > +{ > + /* ld 3,0(3); li 4,0. */ > + return *p; > +} > + > +__uint128_t > +vsx_to_gpr (__uint128_t *p, double d) > +{ > + /* fctiduz 1,1; li 4,0;mfvsrd 3,1. */ > + return (unsigned long long)d; > +} > + > +void > +gpr_to_vsx (__uint128_t *p, unsigned long long a) > +{ > + /* mtvsrdd 0,0,4; stxv 0,0(3). */ > + __uint128_t b = a; > + __asm__ (" # %x0" : "+wa" (b)); > + *p = b; > +} > + > +void > +mem_to_vsx (__uint128_t *p, unsigned long long *q) > +{ > + /* lxvrdx 0,0,4; stxv 0,0(3). */ > + __uint128_t a = *q; > + __asm__ (" # %x0" : "+wa" (a)); > + *p = a; > +} > + > +void > +vsx_to_vsx (__uint128_t *p, double d) > +{ > + /* fctiduz 1,1; xxspltib 0,0; xxpermdi 0,0,1,0; stxv 0,0(3). */ > + __uint128_t a = (unsigned long long)d; > + __asm__ (" # %x0" : "+wa" (a)); > + *p = a; > +} > + > +/* { dg-final { scan-assembler-times {\mli\M} 3 } } */ > +/* { dg-final { scan-assembler-times {\mld\M} 1 } } */ > +/* { dg-final { scan-assembler-times {\mlxvrdx\M} 1 } } */ > +/* { dg-final { scan-assembler-times {\mmfvsrd\M} 1 } } */ > +/* { dg-final { scan-assembler-times {\mmtvsrdd\M} 1 } } */ > +/* { dg-final { scan-assembler-times {\mstxv\M} 3 } } */ > +/* { dg-final { scan-assembler-times {\mxxpermdi\M} 1 } } */ > -- > 2.35.3 > >
On Fri, May 13, 2022 at 12:32:22PM -0500, Segher Boessenkool wrote: > On Fri, May 13, 2022 at 11:08:48AM -0400, Michael Meissner wrote: > > Add zero_extendditi2. Improve lxvr*x code generation. > > <snip> > > Nothing in this pass haas anything to do with the subject. Which is a > good thing, because not expanding addti3 etc. to RTL that mimics the > machine insns is not acceptable at all. Yes, obviously I got mixed up in terms of the title. The title should have been: Add zero_extendditi2. Improve lxvr*x code generation.
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index c091e5e2f47..ad971e3a1de 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -1315,14 +1315,32 @@ (define_expand "vsx_store_<mode>" } }) -;; Load rightmost element from load_data -;; using lxvrbx, lxvrhx, lxvrwx, lxvrdx. -(define_insn "vsx_lxvr<wd>x" - [(set (match_operand:TI 0 "vsx_register_operand" "=wa") - (zero_extend:TI (match_operand:INT_ISA3 1 "memory_operand" "Z")))] - "TARGET_POWER10" - "lxvr<wd>x %x0,%y1" - [(set_attr "type" "vecload")]) +;; Load rightmost element from load_data using lxvrbx, lxvrhx, lxvrwx, lxvrdx. +;; Support TImode being in a GPR register to prevent generating lvxr{d,w,b}x +;; and then two direct moves if we ultimately need the value in a GPR register. +(define_insn_and_split "vsx_lxvr<wd>x" + [(set (match_operand:TI 0 "register_operand" "=r,wa") + (zero_extend:TI (match_operand:INT_ISA3 1 "memory_operand" "m,Z")))] + "TARGET_POWERPC64 && TARGET_POWER10" + "@ + # + lxvr<wd>x %x0,%y1" + "&& reload_completed && int_reg_operand (operands[0], TImode)" + [(set (match_dup 2) (match_dup 3)) + (set (match_dup 4) (const_int 0))] +{ + rtx op0 = operands[0]; + rtx op1 = operands[1]; + + operands[2] = gen_lowpart (DImode, op0); + operands[3] = (<MODE>mode == DImode + ? op1 + : gen_rtx_ZERO_EXTEND (DImode, op1)); + + operands[4] = gen_highpart (DImode, op0); +} + [(set_attr "type" "load,vecload") + (set_attr "num_insns" "2,*")]) ;; Store rightmost element into store_data ;; using stxvrbx, stxvrhx, strvxwx, strvxdx. @@ -5019,6 +5037,54 @@ (define_expand "vsignextend_si_v2di" DONE; }) +;; Zero extend DI to TI. If we don't have the MTVSRDD instruction (and LXVRDX +;; in the case of power10), we use the machine independent code. If we are +;; loading up GPRs, we fall back to the old code. +(define_insn_and_split "zero_extendditi2" + [(set (match_operand:TI 0 "register_operand" "=r,r, wa,&wa") + (zero_extend:TI (match_operand:DI 1 "register_operand" "r,wa,r, wa")))] + "TARGET_POWERPC64 && TARGET_P9_VECTOR" + "@ + # + # + mtvsrdd %x0,0,%1 + #" + "&& reload_completed + && (int_reg_operand (operands[0], TImode) + || vsx_register_operand (operands[1], DImode))" + [(pc)] +{ + rtx dest = operands[0]; + rtx src = operands[1]; + int dest_regno = reg_or_subregno (dest); + + /* Handle conversion to GPR registers. Load up the low part and then do + zero out the upper part. */ + if (INT_REGNO_P (dest_regno)) + { + rtx dest_hi = gen_highpart (DImode, dest); + rtx dest_lo = gen_lowpart (DImode, dest); + + emit_move_insn (dest_lo, src); + emit_move_insn (dest_hi, const0_rtx); + DONE; + } + + /* For settomg a VSX register from another VSX register, clear the result + register, and use XXPERMDI to shift the value into the lower 64-bits. */ + rtx dest_v2di = gen_rtx_REG (V2DImode, dest_regno); + rtx dest_di = gen_rtx_REG (DImode, dest_regno); + + emit_move_insn (dest_v2di, CONST0_RTX (V2DImode)); + if (BYTES_BIG_ENDIAN) + emit_insn (gen_vsx_concat_v2di (dest_v2di, dest_di, src)); + else + emit_insn (gen_vsx_concat_v2di (dest_v2di, src, dest_di)); + DONE; +} + [(set_attr "type" "integer,mfvsr,vecmove,vecperm") + (set_attr "length" "8, 8, *, 8")]) + ;; Sign extend DI to TI. We provide both GPR targets and Altivec targets on ;; power10. On earlier systems, the machine independent code will generate a ;; shift left to sign extend the 64-bit value to 128-bit. diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c index c40e1a3a0f7..1f1281d6b75 100644 --- a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c +++ b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c @@ -6,33 +6,25 @@ /* { dg-do compile { target { ! power10_hw } } } */ /* { dg-require-effective-target power10_ok } */ /* { dg-require-effective-target int128 } */ - -/* Deliberately set optization to zero for this test to confirm - the lxvr*x instruction is generated. At higher optimization levels - the instruction we are looking for is sometimes replaced by other - load instructions. */ -/* { dg-options "-mdejagnu-cpu=power10 -O0 -save-temps" } */ - +/* { dg-options "-mdejagnu-cpu=power10 -O2 -save-temps" } */ /* { dg-final { scan-assembler-times {\mlxvrwx\M} 2 } } */ #define NUM_VEC_ELEMS 4 #define ITERS 16 -/* -Codegen at time of writing is a single lxvrwx for the zero -extended test, and a lwax,mtvsrdd,vextsd2q for the sign -extended test. - -0000000010000c90 <test_sign_extended_load>: - 10000c90: aa 1a 24 7d lwax r9,r4,r3 - 10000c94: 67 4b 40 7c mtvsrdd vs34,0,r9 - 10000c98: 02 16 5b 10 vextsd2q v2,v2 - 10000c9c: 20 00 80 4e blr - -0000000010000cb0 <test_zero_extended_unsigned_load>: - 10000cb0: 9b 18 44 7c lxvrwx vs34,r4,r3 - 10000cb4: 20 00 80 4e blr -*/ +/* Codegen at time of writing is a single lxvrwx for the zero extended test, + and a lxvrwx + vexts* sign extension instructions for the sign extended + test. + + 0000000000000000 <test_sign_extended_load>: + 0: 9b 18 44 7c lxvrwx vs34,r4,r3 + 4: 02 16 5a 10 vextsw2d v2,v2 + 8: 02 16 5b 10 vextsd2q v2,v2 + c: 20 00 80 4e blr + + 0000000000000020 <test_zero_extended_unsigned_load>: + 20: 9b 18 44 7c lxvrwx vs34,r4,r3 + 24: 20 00 80 4e blr */ #include <altivec.h> #include <stdio.h> diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c index 837ba79c9ab..a7721318812 100644 --- a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c +++ b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c @@ -6,33 +6,26 @@ /* { dg-do compile { target { ! power10_hw } } } */ /* { dg-require-effective-target power10_ok } */ /* { dg-require-effective-target int128 } */ - -/* Deliberately set optization to zero for this test to confirm - the lxvr*x instruction is generated. At higher optimization levels - the instruction we are looking for is sometimes replaced by other - load instructions. */ -/* { dg-options "-mdejagnu-cpu=power10 -O0 -save-temps" } */ +/* { dg-options "-mdejagnu-cpu=power10 -O2 -save-temps" } */ /* { dg-final { scan-assembler-times {\mlxvrhx\M} 2 } } */ #define NUM_VEC_ELEMS 8 #define ITERS 16 -/* -Codegen at time of writing uses lxvrhx for the zero -extension test and lhax,mtvsrdd,vextsd2q for the -sign extended test. - -0000000010001810 <test_sign_extended_load>: - 10001810: ae 1a 24 7d lhax r9,r4,r3 - 10001814: 67 4b 40 7c mtvsrdd vs34,0,r9 - 10001818: 02 16 5b 10 vextsd2q v2,v2 - 1000181c: 20 00 80 4e blr - -0000000010001830 <test_zero_extended_unsigned_load>: - 10001830: 5b 18 44 7c lxvrhx vs34,r4,r3 - 10001834: 20 00 80 4e blr -*/ +/* Codegen at time of writing is a single lxvrwx for the zero extended test, + and a lxvrwx + vexts* sign extension instructions for the sign extended + test. + + 0000000000000000 <test_sign_extended_load>: + 0: 5b 18 44 7c lxvrhx vs34,r4,r3 + 4: 02 16 59 10 vextsh2d v2,v2 + 8: 02 16 5b 10 vextsd2q v2,v2 + c: 20 00 80 4e blr + + 0000000000000020 <test_zero_extended_unsigned_load>: + 20: 5b 18 44 7c lxvrhx vs34,r4,r3 + 24: 20 00 80 4e blr */ #include <altivec.h> #include <stdio.h> diff --git a/gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c b/gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c new file mode 100644 index 00000000000..9b3b9c4dbd0 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/zero-extend-di-ti.c @@ -0,0 +1,62 @@ +/* { dg-require-effective-target int128 } */ +/* { dg-require-effective-target power10_ok } */ +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ + +/* This patch makes sure the various optimization and code paths are done for + zero extending DImode to TImode on power10. */ + +__uint128_t +gpr_to_gpr (unsigned long long a) +{ + /* li 4,0. */ + return a; +} + +__uint128_t +mem_to_gpr (unsigned long long *p) +{ + /* ld 3,0(3); li 4,0. */ + return *p; +} + +__uint128_t +vsx_to_gpr (__uint128_t *p, double d) +{ + /* fctiduz 1,1; li 4,0;mfvsrd 3,1. */ + return (unsigned long long)d; +} + +void +gpr_to_vsx (__uint128_t *p, unsigned long long a) +{ + /* mtvsrdd 0,0,4; stxv 0,0(3). */ + __uint128_t b = a; + __asm__ (" # %x0" : "+wa" (b)); + *p = b; +} + +void +mem_to_vsx (__uint128_t *p, unsigned long long *q) +{ + /* lxvrdx 0,0,4; stxv 0,0(3). */ + __uint128_t a = *q; + __asm__ (" # %x0" : "+wa" (a)); + *p = a; +} + +void +vsx_to_vsx (__uint128_t *p, double d) +{ + /* fctiduz 1,1; xxspltib 0,0; xxpermdi 0,0,1,0; stxv 0,0(3). */ + __uint128_t a = (unsigned long long)d; + __asm__ (" # %x0" : "+wa" (a)); + *p = a; +} + +/* { dg-final { scan-assembler-times {\mli\M} 3 } } */ +/* { dg-final { scan-assembler-times {\mld\M} 1 } } */ +/* { dg-final { scan-assembler-times {\mlxvrdx\M} 1 } } */ +/* { dg-final { scan-assembler-times {\mmfvsrd\M} 1 } } */ +/* { dg-final { scan-assembler-times {\mmtvsrdd\M} 1 } } */ +/* { dg-final { scan-assembler-times {\mstxv\M} 3 } } */ +/* { dg-final { scan-assembler-times {\mxxpermdi\M} 1 } } */