diff mbox series

[2/2,rs6000,V2] VSX load/store rightmost element operations

Message ID eb9649eac9cb6e77be670962a6b582db6d2539ef.camel@vnet.ibm.com
State New
Headers show
Series None | expand

Commit Message

will schmidt Oct. 20, 2020, 9:34 p.m. UTC
[PATCH 2/2, rs6000, v2] VSX load/store rightmost element operations

Hi,
This adds support for the VSX load/store rightmost element operations.
This includes the instructions lxvrbx, lxvrhx, lxvrwx, lxvrdx,
stxvrbx, stxvrhx, stxvrwx, stxvrdx; And the builtins
vec_xl_sext() /* vector load sign extend */
vec_xl_zext() /* vector load zero extend */
vec_xst_trunc() /* vector store truncate */.

Testcase results show that the instructions added with this patch show
up at low/no optimization (-O0), with a number of those being replaced
with other load and store instructions at higher optimization levels.
I've kept those tests at -O0 to confirm these newly added instructions
are generated.

[v2] Refreshed per review comments.
Comments cleaned up, indentation corrected, dg-* stanzas updated.

Regtested OK for Linux on power8,power9 targets.  Sniff-regtested OK on
power10 simulator.
OK for trunk?

Thanks,
-Will

gcc/ChangeLog:
    * config/rs6000/altivec.h (vec_xl_zext, vec_xl_sext, vec_xst_trunc): New
    defines.
    * config/rs6000/rs6000-builtin.def (BU_P10V_OVERLOAD_X): New builtin macro.
    (BU_P10V_AV_X): New builtin macro.
    (se_lxvrbx, se_lxvrhx, se_lxvrwx, se_lxvrdx): Define internal names for
    load and sign extend vector element.
    (ze_lxvrbx, ze_lxvrhx, ze_lxvrwx, ze_lxvrdx): Define internal names for
    load and zero extend vector element.
    (tr_stxvrbx, tr_stxvrhx, tr_stxvrwx, tr_stxvrdx): Define internal names
    for truncate and store vector element.
    (se_lxvrx, ze_lxvrx, tr_stxvrx): Define internal names for overloaded
    load/store rightmost element.
    * config/rs6000/rs6000-call.c (altivec_overloaded_builtins): Define the
    internal monomorphs P10_BUILTIN_SE_LXVRBX, P10_BUILTIN_SE_LXVRHX,
    P10_BUILTIN_SE_LXVRWX, P10_BUILTIN_SE_LXVRDX,
    P10_BUILTIN_ZE_LXVRBX, P10_BUILTIN_ZE_LXVRHX, P10_BUILTIN_ZE_LXVRWX,
    P10_BUILTIN_ZE_LXVRDX,
    P10_BUILTIN_TR_STXVRBX, P10_BUILTIN_TR_STXVRHX, P10_BUILTIN_TR_STXVRWX,
    P10_BUILTIN_TR_STXVRDX.
    (altivec_expand_lxvr_builtin): New expansion for load element builtins.
    (altivec_expand_stv_builtin): Update to support truncate and store builtins.
    (altivec_expand_builtin): Add cases for the load/store rightmost builtins.
    (altivec_init_builtins): Add def_builtin entries for
    __builtin_altivec_se_lxvrbx, __builtin_altivec_se_lxvrhx,
    __builtin_altivec_se_lxvrwx, __builtin_altivec_se_lxvrdx,
    __builtin_altivec_ze_lxvrbx, __builtin_altivec_ze_lxvrhx,
    __builtin_altivec_ze_lxvrwx, __builtin_altivec_ze_lxvrdx,
    __builtin_altivec_tr_stxvrbx, __builtin_altivec_tr_stxvrhx,
    __builtin_altivec_tr_stxvrwx, __builtin_altivec_tr_stxvrdx,
    __builtin_vec_se_lxvrx, __builtin_vec_ze_lxvrx, __builtin_vec_tr_stxvrx.
    * config/rs6000/vsx.md (vsx_lxvr<wd>x, vsx_stxvr<wd>x):
    New define_insn entries.
    * doc/extend.texi:  Add documentation for vec_xl_sext, vec_xl_zext,
    and vec_xst_trunc.

gcc/testsuite/ChangeLog:
    * gcc.target/powerpc/vsx-load-element-extend-char.c: New test.
    * gcc.target/powerpc/vsx-load-element-extend-int.c: New test.
    * gcc.target/powerpc/vsx-load-element-extend-longlong.c: New test.
    * gcc.target/powerpc/vsx-load-element-extend-short.c: New test.
    * gcc.target/powerpc/vsx-store-element-truncate-char.c: New test.
    * gcc.target/powerpc/vsx-store-element-truncate-int.c: New test.
    * gcc.target/powerpc/vsx-store-element-truncate-longlong.c: New test.
    * gcc.target/powerpc/vsx-store-element-truncate-short.c: New test.

Comments

Segher Boessenkool Oct. 21, 2020, 8:57 p.m. UTC | #1
On Tue, Oct 20, 2020 at 04:34:46PM -0500, will schmidt wrote:
> This adds support for the VSX load/store rightmost element operations.
> This includes the instructions lxvrbx, lxvrhx, lxvrwx, lxvrdx,
> stxvrbx, stxvrhx, stxvrwx, stxvrdx; And the builtins
> vec_xl_sext() /* vector load sign extend */
> vec_xl_zext() /* vector load zero extend */
> vec_xst_trunc() /* vector store truncate */.

I think this is fine now.  Thanks!  Okay for trunk.


Segher
diff mbox series

Patch

diff --git a/gcc/config/rs6000/altivec.h b/gcc/config/rs6000/altivec.h
index 8a2dcda01442..df10a8c498dd 100644
--- a/gcc/config/rs6000/altivec.h
+++ b/gcc/config/rs6000/altivec.h
@@ -234,10 +234,13 @@ 
 #define vec_lde __builtin_vec_lde
 #define vec_ldl __builtin_vec_ldl
 #define vec_lvebx __builtin_vec_lvebx
 #define vec_lvehx __builtin_vec_lvehx
 #define vec_lvewx __builtin_vec_lvewx
+#define vec_xl_zext __builtin_vec_ze_lxvrx
+#define vec_xl_sext __builtin_vec_se_lxvrx
+#define vec_xst_trunc __builtin_vec_tr_stxvrx
 #define vec_neg __builtin_vec_neg
 #define vec_pmsum_be __builtin_vec_vpmsum
 #define vec_shasigma_be __builtin_crypto_vshasigma
 /* Cell only intrinsics.  */
 #ifdef __PPU__
diff --git a/gcc/config/rs6000/rs6000-builtin.def b/gcc/config/rs6000/rs6000-builtin.def
index 3eb55f0ae434..5b05da87f4bf 100644
--- a/gcc/config/rs6000/rs6000-builtin.def
+++ b/gcc/config/rs6000/rs6000-builtin.def
@@ -1143,10 +1143,18 @@ 
 		    (RS6000_BTC_ ## ATTR		/* ATTR */	\
 		     | RS6000_BTC_BINARY),				\
 		    CODE_FOR_ ## ICODE)			/* ICODE */
 #endif
 
+#define BU_P10V_OVERLOAD_X(ENUM, NAME)				\
+  RS6000_BUILTIN_X (P10_BUILTIN_VEC_ ## ENUM,		/* ENUM */	\
+		    "__builtin_vec_" NAME,		/* NAME */	\
+		    RS6000_BTM_P10,			/* MASK */	\
+		    (RS6000_BTC_OVERLOADED		/* ATTR */	\
+		     | RS6000_BTC_SPECIAL),				\
+		    CODE_FOR_nothing)			/* ICODE */
+
 /* Power 10 Altivec builtins  */
 
 #define BU_P10V_AV_0(ENUM, NAME, ATTR, ICODE)				\
   RS6000_BUILTIN_0 (P10V_BUILTIN_ ## ENUM,		/* ENUM */	\
 		    "__builtin_altivec_" NAME,		/* NAME */	\
@@ -1177,10 +1185,19 @@ 
 		    RS6000_BTM_P10,			/* MASK */	\
 		    (RS6000_BTC_ ## ATTR		/* ATTR */	\
 		     | RS6000_BTC_TERNARY),				\
 		    CODE_FOR_ ## ICODE)			/* ICODE */
 
+#define BU_P10V_AV_X(ENUM, NAME, ATTR)					\
+  RS6000_BUILTIN_X (P10_BUILTIN_ ## ENUM,		/* ENUM */	\
+		    "__builtin_altivec_" NAME,		/* NAME */	\
+		    RS6000_BTM_P10,			/* MASK */	\
+		    (RS6000_BTC_ ## ATTR		/* ATTR */	\
+		     | RS6000_BTC_SPECIAL),				\
+		    CODE_FOR_nothing)			/* ICODE */
+
+
 
 /* Insure 0 is not a legitimate index.  */
 BU_SPECIAL_X (RS6000_BUILTIN_NONE, NULL, 0, RS6000_BTC_MISC)
 
 /* 3 argument Altivec builtins.  */
@@ -1472,10 +1489,22 @@  BU_ALTIVEC_X (DSS,		"dss",		    MISC)
 BU_ALTIVEC_X (LVSL,		"lvsl",		    PURE)
 BU_ALTIVEC_X (LVSR,		"lvsr",		    PURE)
 BU_ALTIVEC_X (LVEBX,		"lvebx",	    PURE)
 BU_ALTIVEC_X (LVEHX,		"lvehx",	    PURE)
 BU_ALTIVEC_X (LVEWX,		"lvewx",	    PURE)
+BU_P10V_AV_X (SE_LXVRBX,	"se_lxvrbx",	    PURE)
+BU_P10V_AV_X (SE_LXVRHX,	"se_lxvrhx",	    PURE)
+BU_P10V_AV_X (SE_LXVRWX,	"se_lxvrwx",	    PURE)
+BU_P10V_AV_X (SE_LXVRDX,	"se_lxvrdx",	    PURE)
+BU_P10V_AV_X (ZE_LXVRBX,	"ze_lxvrbx",	    PURE)
+BU_P10V_AV_X (ZE_LXVRHX,	"ze_lxvrhx",	    PURE)
+BU_P10V_AV_X (ZE_LXVRWX,	"ze_lxvrwx",	    PURE)
+BU_P10V_AV_X (ZE_LXVRDX,	"ze_lxvrdx",	    PURE)
+BU_P10V_AV_X (TR_STXVRBX,	"tr_stxvrbx",	    MEM)
+BU_P10V_AV_X (TR_STXVRHX,	"tr_stxvrhx",	    MEM)
+BU_P10V_AV_X (TR_STXVRWX,	"tr_stxvrwx",	    MEM)
+BU_P10V_AV_X (TR_STXVRDX,	"tr_stxvrdx",	    MEM)
 BU_ALTIVEC_X (LVXL,		"lvxl",		    PURE)
 BU_ALTIVEC_X (LVXL_V2DF,	"lvxl_v2df",	    PURE)
 BU_ALTIVEC_X (LVXL_V2DI,	"lvxl_v2di",	    PURE)
 BU_ALTIVEC_X (LVXL_V4SF,	"lvxl_v4sf",	    PURE)
 BU_ALTIVEC_X (LVXL_V4SI,	"lvxl_v4si",	    PURE)
@@ -1738,10 +1767,13 @@  BU_ALTIVEC_OVERLOAD_X (LD,	   "ld")
 BU_ALTIVEC_OVERLOAD_X (LDE,	   "lde")
 BU_ALTIVEC_OVERLOAD_X (LDL,	   "ldl")
 BU_ALTIVEC_OVERLOAD_X (LVEBX,	   "lvebx")
 BU_ALTIVEC_OVERLOAD_X (LVEHX,	   "lvehx")
 BU_ALTIVEC_OVERLOAD_X (LVEWX,	   "lvewx")
+BU_P10V_OVERLOAD_X (SE_LXVRX,   "se_lxvrx")
+BU_P10V_OVERLOAD_X (ZE_LXVRX,   "ze_lxvrx")
+BU_P10V_OVERLOAD_X (TR_STXVRX,  "tr_stxvrx")
 BU_ALTIVEC_OVERLOAD_X (LVLX,	   "lvlx")
 BU_ALTIVEC_OVERLOAD_X (LVLXL,	   "lvlxl")
 BU_ALTIVEC_OVERLOAD_X (LVRX,	   "lvrx")
 BU_ALTIVEC_OVERLOAD_X (LVRXL,	   "lvrxl")
 BU_ALTIVEC_OVERLOAD_X (LVSL,	   "lvsl")
diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c
index 9fdf97bc8031..de24ce57c77b 100644
--- a/gcc/config/rs6000/rs6000-call.c
+++ b/gcc/config/rs6000/rs6000-call.c
@@ -1150,10 +1150,69 @@  const struct altivec_builtin_types altivec_overloaded_builtins[] = {
   { ALTIVEC_BUILTIN_VEC_LVEBX, ALTIVEC_BUILTIN_LVEBX,
     RS6000_BTI_V16QI, RS6000_BTI_INTSI, ~RS6000_BTI_INTQI, 0 },
   { ALTIVEC_BUILTIN_VEC_LVEBX, ALTIVEC_BUILTIN_LVEBX,
     RS6000_BTI_unsigned_V16QI, RS6000_BTI_INTSI, ~RS6000_BTI_UINTQI, 0 },
 
+  /* vector signed __int128 vec_xl_sext (signed long long, signed char *);
+     vector signed __int128 vec_xl_sext (signed long long, signed short *);
+     vector signed __int128 vec_xl_sext (signed long long, signed int *);
+     vector signed __int128 vec_xl_sext (signed long long, signed long long *); */
+  { P10_BUILTIN_VEC_SE_LXVRX, P10_BUILTIN_SE_LXVRBX,
+    RS6000_BTI_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_INTQI, 0 },
+  { P10_BUILTIN_VEC_SE_LXVRX, P10_BUILTIN_SE_LXVRHX,
+    RS6000_BTI_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_INTHI, 0 },
+  { P10_BUILTIN_VEC_SE_LXVRX, P10_BUILTIN_SE_LXVRWX,
+    RS6000_BTI_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_INTSI, 0 },
+  { P10_BUILTIN_VEC_SE_LXVRX, P10_BUILTIN_SE_LXVRDX,
+    RS6000_BTI_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_INTDI, 0 },
+  { P10_BUILTIN_VEC_SE_LXVRX, P10_BUILTIN_SE_LXVRDX,
+    RS6000_BTI_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_long_long, 0 },
+
+  /* vector unsigned __int128 vec_xl_zext (signed long long, unsigned char *);
+     vector unsigned __int128 vec_xl_zext (signed long long, unsigned short *);
+     vector unsigned __int128 vec_xl_zext (signed long long, unsigned int *);
+     vector unsigned __int128 vec_xl_zext (signed long long, unsigned long long *); */
+  { P10_BUILTIN_VEC_ZE_LXVRX, P10_BUILTIN_ZE_LXVRBX,
+    RS6000_BTI_unsigned_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_UINTQI, 0 },
+  { P10_BUILTIN_VEC_ZE_LXVRX, P10_BUILTIN_ZE_LXVRHX,
+    RS6000_BTI_unsigned_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_UINTHI, 0 },
+  { P10_BUILTIN_VEC_ZE_LXVRX, P10_BUILTIN_ZE_LXVRWX,
+    RS6000_BTI_unsigned_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_UINTSI, 0 },
+  { P10_BUILTIN_VEC_ZE_LXVRX, P10_BUILTIN_ZE_LXVRDX,
+    RS6000_BTI_unsigned_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_UINTDI, 0 },
+  { P10_BUILTIN_VEC_ZE_LXVRX, P10_BUILTIN_ZE_LXVRDX,
+    RS6000_BTI_unsigned_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_unsigned_long_long, 0 },
+
+  /* void vec_xst_trunc (vector signed __int128, signed long long, signed char *);
+     void vec_xst_trunc (vector unsigned __int128, signed long long, unsigned char *);
+     void vec_xst_trunc (vector signed __int128, signed long long, signed short *);
+     void vec_xst_trunc (vector unsigned __int128, signed long long, unsigned short *);
+     void vec_xst_trunc (vector signed __int128, signed long long, signed int *);
+     void vec_xst_trunc (vector unsigned __int128, signed long long, unsigned int *);
+     void vec_xst_trunc (vector signed __int128, signed long long, signed long long *);
+     void vec_xst_trunc (vector unsigned __int128, signed long long, unsigned long long *); */
+  { P10_BUILTIN_VEC_TR_STXVRX, P10_BUILTIN_TR_STXVRBX, RS6000_BTI_void,
+    RS6000_BTI_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_INTQI },
+  { P10_BUILTIN_VEC_TR_STXVRX, P10_BUILTIN_TR_STXVRBX, RS6000_BTI_void,
+    RS6000_BTI_unsigned_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_UINTQI },
+  { P10_BUILTIN_VEC_TR_STXVRX, P10_BUILTIN_TR_STXVRHX, RS6000_BTI_void,
+    RS6000_BTI_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_INTHI },
+  { P10_BUILTIN_VEC_TR_STXVRX, P10_BUILTIN_TR_STXVRHX, RS6000_BTI_void,
+    RS6000_BTI_unsigned_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_UINTHI },
+  { P10_BUILTIN_VEC_TR_STXVRX, P10_BUILTIN_TR_STXVRWX, RS6000_BTI_void,
+    RS6000_BTI_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_INTSI },
+  { P10_BUILTIN_VEC_TR_STXVRX, P10_BUILTIN_TR_STXVRWX, RS6000_BTI_void,
+    RS6000_BTI_unsigned_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_UINTSI },
+  { P10_BUILTIN_VEC_TR_STXVRX, P10_BUILTIN_TR_STXVRDX, RS6000_BTI_void,
+    RS6000_BTI_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_long_long },
+  { P10_BUILTIN_VEC_TR_STXVRX, P10_BUILTIN_TR_STXVRDX, RS6000_BTI_void,
+    RS6000_BTI_unsigned_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_unsigned_long_long },
+  { P10_BUILTIN_VEC_TR_STXVRX, P10_BUILTIN_TR_STXVRDX, RS6000_BTI_void,
+    RS6000_BTI_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_INTDI },
+  { P10_BUILTIN_VEC_TR_STXVRX, P10_BUILTIN_TR_STXVRDX, RS6000_BTI_void,
+    RS6000_BTI_unsigned_V1TI, RS6000_BTI_INTSI, ~RS6000_BTI_UINTDI },
+
   /*     vector float vec_ldl (int, vector float *);
          vector float vec_ldl (int, float *); */
   { ALTIVEC_BUILTIN_VEC_LDL, ALTIVEC_BUILTIN_LVXL_V4SF,
     RS6000_BTI_V4SF, RS6000_BTI_INTSI, ~RS6000_BTI_V4SF, 0 },
   { ALTIVEC_BUILTIN_VEC_LDL, ALTIVEC_BUILTIN_LVXL_V4SF,
@@ -9572,10 +9631,89 @@  swap_endian_selector_for_mode (machine_mode mode)
 
   return force_reg (V16QImode, gen_rtx_CONST_VECTOR (V16QImode,
 						     gen_rtvec_v (16, perm)));
 }
 
+/* Expand the lxvr*x builtins: load the rightmost element at ARG1 + ARG0 and
+ zero extend it, or sign extend it to TARGET when SIGN_EXTEND is set.  */
+static rtx
+altivec_expand_lxvr_builtin (enum insn_code icode, tree exp, rtx target, bool blk, bool sign_extend)
+{
+  rtx pat, addr;
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  tree arg1 = CALL_EXPR_ARG (exp, 1);
+  machine_mode tmode = insn_data[icode].operand[0].mode;
+  machine_mode smode = insn_data[icode].operand[1].mode;
+  machine_mode mode0 = Pmode;
+  machine_mode mode1 = Pmode;
+  rtx op0 = expand_normal (arg0);
+  rtx op1 = expand_normal (arg1);
+
+  if (icode == CODE_FOR_nothing)
+    /* Builtin not supported on this processor.  */
+    return 0;
+
+  /* If we got invalid arguments bail out before generating bad rtl.  */
+  if (arg0 == error_mark_node || arg1 == error_mark_node)
+    return const0_rtx;
+
+  if (target == 0
+      || GET_MODE (target) != tmode
+      || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
+    target = gen_reg_rtx (tmode);
+
+  op1 = copy_to_mode_reg (mode1, op1);
+
+  if (op0 == const0_rtx)
+    addr = gen_rtx_MEM (blk ? BLKmode : tmode, op1); /* NOTE(review): tmode here, smode below -- confirm.  */
+  else
+    {
+      op0 = copy_to_mode_reg (mode0, op0);
+      addr = gen_rtx_MEM (blk ? BLKmode : smode,
+			  gen_rtx_PLUS (Pmode, op1, op0));
+    }
+
+  if (sign_extend)
+    {
+      rtx discratch = gen_reg_rtx (DImode);
+      rtx tiscratch = gen_reg_rtx (TImode);
+
+      /* Emit the lxvr*x insn into a TImode scratch register.  */
+      pat = GEN_FCN (icode) (tiscratch, addr);
+      if (!pat)
+	return 0;
+      emit_insn (pat);
+
+      /* Emit a sign extension from QI,HI,SI to double (DI).  */
+      rtx scratch = gen_lowpart (smode, tiscratch);
+      if (icode == CODE_FOR_vsx_lxvrbx)
+	emit_insn (gen_extendqidi2 (discratch, scratch));
+      else if (icode == CODE_FOR_vsx_lxvrhx)
+	emit_insn (gen_extendhidi2 (discratch, scratch));
+      else if (icode == CODE_FOR_vsx_lxvrwx)
+	emit_insn (gen_extendsidi2 (discratch, scratch));
+      /*  Assign discratch directly if scratch is already DI.  */
+      if (icode == CODE_FOR_vsx_lxvrdx)
+	discratch = scratch;
+
+      /* Emit the sign extension from DI (double) to TI (quad).  */
+      emit_insn (gen_extendditi2 (target, discratch));
+
+      return target;
+    }
+  else
+    {
+      /* Zero extend: the lxvr*x pattern already does it, so load into TARGET.  */
+      pat = GEN_FCN (icode) (target, addr);
+      if (!pat)
+	return 0;
+      emit_insn (pat);
+      return target;
+    }
+  return 0;
+}
+
 static rtx
 altivec_expand_lv_builtin (enum insn_code icode, tree exp, rtx target, bool blk)
 {
   rtx pat, addr;
   tree arg0 = CALL_EXPR_ARG (exp, 0);
@@ -9690,11 +9828,11 @@  altivec_expand_stv_builtin (enum insn_code icode, tree exp)
   tree arg1 = CALL_EXPR_ARG (exp, 1);
   tree arg2 = CALL_EXPR_ARG (exp, 2);
   rtx op0 = expand_normal (arg0);
   rtx op1 = expand_normal (arg1);
   rtx op2 = expand_normal (arg2);
-  rtx pat, addr, rawaddr;
+  rtx pat, addr, rawaddr, truncrtx;
   machine_mode tmode = insn_data[icode].operand[0].mode;
   machine_mode smode = insn_data[icode].operand[1].mode;
   machine_mode mode1 = Pmode;
   machine_mode mode2 = Pmode;
 
@@ -9729,10 +9867,29 @@  altivec_expand_stv_builtin (enum insn_code icode, tree exp)
 
       op0 = copy_to_mode_reg (tmode, op0);
 
       emit_insn (gen_rtx_SET (addr, op0));
     }
+  else if (icode == CODE_FOR_vsx_stxvrbx
+	   || icode == CODE_FOR_vsx_stxvrhx
+	   || icode == CODE_FOR_vsx_stxvrwx
+	   || icode == CODE_FOR_vsx_stxvrdx)
+    {
+      truncrtx = gen_rtx_TRUNCATE (tmode, op0);
+      op0 = copy_to_mode_reg (E_TImode, truncrtx);
+
+      if (op1 == const0_rtx)
+	addr = gen_rtx_MEM (Pmode, op2);
+      else
+	{
+	  op1 = copy_to_mode_reg (mode1, op1);
+	  addr = gen_rtx_MEM (tmode, gen_rtx_PLUS (Pmode, op2, op1));
+	}
+      pat = GEN_FCN (icode) (addr, op0);
+      if (pat)
+	emit_insn (pat);
+    }
   else
     {
       if (! (*insn_data[icode].operand[1].predicate) (op0, smode))
 	op0 = copy_to_mode_reg (smode, op0);
 
@@ -10748,10 +10905,20 @@  altivec_expand_builtin (tree exp, rtx target, bool *expandedp)
       return altivec_expand_stv_builtin (CODE_FOR_altivec_stvebx, exp);
     case ALTIVEC_BUILTIN_STVEHX:
       return altivec_expand_stv_builtin (CODE_FOR_altivec_stvehx, exp);
     case ALTIVEC_BUILTIN_STVEWX:
       return altivec_expand_stv_builtin (CODE_FOR_altivec_stvewx, exp);
+
+    case P10_BUILTIN_TR_STXVRBX:
+      return altivec_expand_stv_builtin (CODE_FOR_vsx_stxvrbx, exp);
+    case P10_BUILTIN_TR_STXVRHX:
+      return altivec_expand_stv_builtin (CODE_FOR_vsx_stxvrhx, exp);
+    case P10_BUILTIN_TR_STXVRWX:
+      return altivec_expand_stv_builtin (CODE_FOR_vsx_stxvrwx, exp);
+    case P10_BUILTIN_TR_STXVRDX:
+      return altivec_expand_stv_builtin (CODE_FOR_vsx_stxvrdx, exp);
+
     case ALTIVEC_BUILTIN_STVXL_V2DF:
       return altivec_expand_stv_builtin (CODE_FOR_altivec_stvxl_v2df, exp);
     case ALTIVEC_BUILTIN_STVXL_V2DI:
       return altivec_expand_stv_builtin (CODE_FOR_altivec_stvxl_v2di, exp);
     case ALTIVEC_BUILTIN_STVXL_V4SF:
@@ -11010,10 +11177,34 @@  altivec_expand_builtin (tree exp, rtx target, bool *expandedp)
       return altivec_expand_lv_builtin (CODE_FOR_altivec_lvehx,
 					exp, target, false);
     case ALTIVEC_BUILTIN_LVEWX:
       return altivec_expand_lv_builtin (CODE_FOR_altivec_lvewx,
 					exp, target, false);
+    case P10_BUILTIN_SE_LXVRBX:
+      return altivec_expand_lxvr_builtin (CODE_FOR_vsx_lxvrbx,
+					exp, target, false, true);
+    case P10_BUILTIN_SE_LXVRHX:
+      return altivec_expand_lxvr_builtin (CODE_FOR_vsx_lxvrhx,
+					exp, target, false, true);
+    case P10_BUILTIN_SE_LXVRWX:
+      return altivec_expand_lxvr_builtin (CODE_FOR_vsx_lxvrwx,
+					exp, target, false, true);
+    case P10_BUILTIN_SE_LXVRDX:
+      return altivec_expand_lxvr_builtin (CODE_FOR_vsx_lxvrdx,
+					exp, target, false, true);
+    case P10_BUILTIN_ZE_LXVRBX:
+      return altivec_expand_lxvr_builtin (CODE_FOR_vsx_lxvrbx,
+					exp, target, false, false);
+    case P10_BUILTIN_ZE_LXVRHX:
+      return altivec_expand_lxvr_builtin (CODE_FOR_vsx_lxvrhx,
+					exp, target, false, false);
+    case P10_BUILTIN_ZE_LXVRWX:
+      return altivec_expand_lxvr_builtin (CODE_FOR_vsx_lxvrwx,
+					exp, target, false, false);
+    case P10_BUILTIN_ZE_LXVRDX:
+      return altivec_expand_lxvr_builtin (CODE_FOR_vsx_lxvrdx,
+					exp, target, false, false);
     case ALTIVEC_BUILTIN_LVXL_V2DF:
       return altivec_expand_lv_builtin (CODE_FOR_altivec_lvxl_v2df,
 					exp, target, false);
     case ALTIVEC_BUILTIN_LVXL_V2DI:
       return altivec_expand_lv_builtin (CODE_FOR_altivec_lvxl_v2di,
@@ -13294,10 +13485,22 @@  altivec_init_builtins (void)
   def_builtin ("__builtin_altivec_lvsl", v16qi_ftype_long_pcvoid, ALTIVEC_BUILTIN_LVSL);
   def_builtin ("__builtin_altivec_lvsr", v16qi_ftype_long_pcvoid, ALTIVEC_BUILTIN_LVSR);
   def_builtin ("__builtin_altivec_lvebx", v16qi_ftype_long_pcvoid, ALTIVEC_BUILTIN_LVEBX);
   def_builtin ("__builtin_altivec_lvehx", v8hi_ftype_long_pcvoid, ALTIVEC_BUILTIN_LVEHX);
   def_builtin ("__builtin_altivec_lvewx", v4si_ftype_long_pcvoid, ALTIVEC_BUILTIN_LVEWX);
+  def_builtin ("__builtin_altivec_se_lxvrbx", v16qi_ftype_long_pcvoid, P10_BUILTIN_SE_LXVRBX);
+  def_builtin ("__builtin_altivec_se_lxvrhx", v8hi_ftype_long_pcvoid, P10_BUILTIN_SE_LXVRHX);
+  def_builtin ("__builtin_altivec_se_lxvrwx", v4si_ftype_long_pcvoid, P10_BUILTIN_SE_LXVRWX);
+  def_builtin ("__builtin_altivec_se_lxvrdx", v2di_ftype_long_pcvoid, P10_BUILTIN_SE_LXVRDX);
+  def_builtin ("__builtin_altivec_ze_lxvrbx", v16qi_ftype_long_pcvoid, P10_BUILTIN_ZE_LXVRBX);
+  def_builtin ("__builtin_altivec_ze_lxvrhx", v8hi_ftype_long_pcvoid, P10_BUILTIN_ZE_LXVRHX);
+  def_builtin ("__builtin_altivec_ze_lxvrwx", v4si_ftype_long_pcvoid, P10_BUILTIN_ZE_LXVRWX);
+  def_builtin ("__builtin_altivec_ze_lxvrdx", v2di_ftype_long_pcvoid, P10_BUILTIN_ZE_LXVRDX);
+  def_builtin ("__builtin_altivec_tr_stxvrbx", void_ftype_v1ti_long_pvoid, P10_BUILTIN_TR_STXVRBX);
+  def_builtin ("__builtin_altivec_tr_stxvrhx", void_ftype_v1ti_long_pvoid, P10_BUILTIN_TR_STXVRHX);
+  def_builtin ("__builtin_altivec_tr_stxvrwx", void_ftype_v1ti_long_pvoid, P10_BUILTIN_TR_STXVRWX);
+  def_builtin ("__builtin_altivec_tr_stxvrdx", void_ftype_v1ti_long_pvoid, P10_BUILTIN_TR_STXVRDX);
   def_builtin ("__builtin_altivec_lvxl", v4si_ftype_long_pcvoid, ALTIVEC_BUILTIN_LVXL);
   def_builtin ("__builtin_altivec_lvxl_v2df", v2df_ftype_long_pcvoid,
 	       ALTIVEC_BUILTIN_LVXL_V2DF);
   def_builtin ("__builtin_altivec_lvxl_v2di", v2di_ftype_long_pcvoid,
 	       ALTIVEC_BUILTIN_LVXL_V2DI);
@@ -13359,10 +13562,13 @@  altivec_init_builtins (void)
   def_builtin ("__builtin_vec_lvsl", v16qi_ftype_long_pcvoid, ALTIVEC_BUILTIN_VEC_LVSL);
   def_builtin ("__builtin_vec_lvsr", v16qi_ftype_long_pcvoid, ALTIVEC_BUILTIN_VEC_LVSR);
   def_builtin ("__builtin_vec_lvebx", v16qi_ftype_long_pcvoid, ALTIVEC_BUILTIN_VEC_LVEBX);
   def_builtin ("__builtin_vec_lvehx", v8hi_ftype_long_pcvoid, ALTIVEC_BUILTIN_VEC_LVEHX);
   def_builtin ("__builtin_vec_lvewx", v4si_ftype_long_pcvoid, ALTIVEC_BUILTIN_VEC_LVEWX);
+  def_builtin ("__builtin_vec_se_lxvrx", v1ti_ftype_long_pcvoid, P10_BUILTIN_VEC_SE_LXVRX);
+  def_builtin ("__builtin_vec_ze_lxvrx", v1ti_ftype_long_pcvoid, P10_BUILTIN_VEC_ZE_LXVRX);
+  def_builtin ("__builtin_vec_tr_stxvrx", void_ftype_opaque_long_pvoid, P10_BUILTIN_VEC_TR_STXVRX);
   def_builtin ("__builtin_vec_st", void_ftype_opaque_long_pvoid, ALTIVEC_BUILTIN_VEC_ST);
   def_builtin ("__builtin_vec_ste", void_ftype_opaque_long_pvoid, ALTIVEC_BUILTIN_VEC_STE);
   def_builtin ("__builtin_vec_stl", void_ftype_opaque_long_pvoid, ALTIVEC_BUILTIN_VEC_STL);
   def_builtin ("__builtin_vec_stvewx", void_ftype_opaque_long_pvoid, ALTIVEC_BUILTIN_VEC_STVEWX);
   def_builtin ("__builtin_vec_stvebx", void_ftype_opaque_long_pvoid, ALTIVEC_BUILTIN_VEC_STVEBX);
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index f7bb4550544e..d960c95dc729 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -1253,10 +1253,28 @@ 
       rs6000_emit_le_vsx_move (operands[0], operands[1], <MODE>mode);
       DONE;
     }
 })
 
+;; Load rightmost element from load_data
+;; using lxvrbx, lxvrhx, lxvrwx, lxvrdx.
+(define_insn "vsx_lxvr<wd>x"
+  [(set (match_operand:TI 0 "vsx_register_operand" "=wa")
+	(zero_extend:TI (match_operand:INT_ISA3  1 "memory_operand" "Z")))]
+  "TARGET_POWER10"
+  "lxvr<wd>x %x0,%y1"
+  [(set_attr "type" "vecload")])
+
+;; Store rightmost element into store_data
+;; using stxvrbx, stxvrhx, stxvrwx, stxvrdx.
+(define_insn "vsx_stxvr<wd>x"
+  [(set (match_operand:INT_ISA3 0 "memory_operand" "=Z")
+	(truncate:INT_ISA3 (match_operand:TI 1 "vsx_register_operand" "wa")))]
+  "TARGET_POWER10"
+  "stxvr<wd>x %x1,%y0"
+  [(set_attr "type" "vecstore")])
+
 ;; Explicit load/store expanders for the builtin functions for lxvd2x, etc.,
 ;; when you really want their element-reversing behavior.
 (define_insn "vsx_ld_elemrev_v2di"
   [(set (match_operand:V2DI 0 "vsx_register_operand" "=wa")
         (vec_select:V2DI
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 62549b024523..b1bbaea13852 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -17658,10 +17658,41 @@  Perform a 64-bit parallel bits deposit operation, as if implemented by the
 @end smallexample
 Perform a 64-bit parallel bits extract operation, as if implemented by the
 @code{pextd} instruction.
 @findex __builtin_pextd
 
+@smallexample
+@exdent vector signed __int128 vec_xl_sext (signed long long, signed char *);
+@exdent vector signed __int128 vec_xl_sext (signed long long, signed short *);
+@exdent vector signed __int128 vec_xl_sext (signed long long, signed int *);
+@exdent vector signed __int128 vec_xl_sext (signed long long, signed long long *);
+@exdent vector unsigned __int128 vec_xl_zext (signed long long, unsigned char *);
+@exdent vector unsigned __int128 vec_xl_zext (signed long long, unsigned short *);
+@exdent vector unsigned __int128 vec_xl_zext (signed long long, unsigned int *);
+@exdent vector unsigned __int128 vec_xl_zext (signed long long, unsigned long long *);
+@end smallexample
+
+Load an element and sign or zero extend it to an __int128 vector, as if implemented by the ISA 3.1
+@code{lxvrbx} @code{lxvrhx} @code{lxvrwx} @code{lxvrdx} instructions.
+@findex vec_xl_sext
+@findex vec_xl_zext
+
+@smallexample
+@exdent void vec_xst_trunc (vector signed __int128, signed long long, signed char *);
+@exdent void vec_xst_trunc (vector signed __int128, signed long long, signed short *);
+@exdent void vec_xst_trunc (vector signed __int128, signed long long, signed int *);
+@exdent void vec_xst_trunc (vector signed __int128, signed long long, signed long long *);
+@exdent void vec_xst_trunc (vector unsigned __int128, signed long long, unsigned char *);
+@exdent void vec_xst_trunc (vector unsigned __int128, signed long long, unsigned short *);
+@exdent void vec_xst_trunc (vector unsigned __int128, signed long long, unsigned int *);
+@exdent void vec_xst_trunc (vector unsigned __int128, signed long long, unsigned long long *);
+@end smallexample
+
+Truncate and store the rightmost element of a vector, as if implemented by the
+ISA 3.1 @code{stxvrbx} @code{stxvrhx} @code{stxvrwx} @code{stxvrdx} instructions.
+@findex vec_xst_trunc
+
 @node PowerPC AltiVec/VSX Built-in Functions
 @subsection PowerPC AltiVec/VSX Built-in Functions
 
 GCC provides an interface for the PowerPC family of processors to access
 the AltiVec operations described in Motorola's AltiVec Programming
diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-char.c b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-char.c
new file mode 100644
index 000000000000..0b8cfd610f88
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-char.c
@@ -0,0 +1,170 @@ 
+/*
+   Test of vec_xl_sext and vec_xl_zext (load into rightmost
+   vector element and zero/sign extend). */
+
+/* { dg-do compile {target power10_ok} } */
+/* { dg-do run {target power10_hw} } */
+/* { dg-options "-mdejagnu-cpu=power10 -O3" } */
+
+/* At the time of writing, the number of lxvrbx instructions is
+   double what we expect because we are generating a 
+   .constprop copy of the function.  */
+/* { dg-final { scan-assembler-times {\mlxvrbx\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mlbx\M} 0 } } */
+
+#define NUM_VEC_ELEMS 16
+#define ITERS 16
+
+/*
+Codegen at time of writing is a lxvrbx for both the
+zero and sign extended tests.  The sign extension test
+also uses mfvsr*d, extsb, mtvsrdd, vextsd2q.
+
+0000000010000c90 <test_sign_extended_load>:
+    10000c90:	1a 18 04 7c 	lxvrbx  vs0,r4,r3
+    10000c94:	66 00 0b 7c 	mfvsrd  r11,vs0
+    10000c98:	66 02 0a 7c 	mfvsrld r10,vs0
+    10000c9c:	74 07 4a 7d 	extsb   r10,r10
+    10000ca0:	67 53 40 7c 	mtvsrdd vs34,0,r10
+    10000ca4:	02 16 5b 10 	vextsd2q v2,v2
+    10000ca8:	20 00 80 4e 	blr
+
+0000000010000cc0 <test_zero_extended_unsigned_load>:
+    10000cc0:	1b 18 44 7c 	lxvrbx  vs34,r4,r3
+    10000cc4:	20 00 80 4e 	blr
+*/
+
+#include <altivec.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include <string.h>
+#include <stdlib.h>
+
+long long buffer[8];
+unsigned long verbose=0;
+
+char initbuffer[64] = {
+	0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18,
+			0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x80,
+	0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
+			0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0x90,
+	0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
+			0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xa0,
+	0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
+			0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xb0
+};
+
+vector signed __int128 signed_expected[16] = {
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x0000000000000011},
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x0000000000000012},
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x0000000000000013},
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x0000000000000014},
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x0000000000000015},
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x0000000000000016},
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x0000000000000017},
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x0000000000000018},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0xffffffffffffff89},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0xffffffffffffff8a},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0xffffffffffffff8b},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0xffffffffffffff8c},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0xffffffffffffff8d},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0xffffffffffffff8e},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0xffffffffffffff8f},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0xffffffffffffff80}
+};
+
+vector unsigned __int128 unsigned_expected[16] = {
+	{ (unsigned __int128) 0x0000000000000000 << 64 | (unsigned __int128) 0x0000000000000011},
+	{ (unsigned __int128) 0x0000000000000000 << 64 | (unsigned __int128) 0x0000000000000012},
+	{ (unsigned __int128) 0x0000000000000000 << 64 | (unsigned __int128) 0x0000000000000013},
+	{ (unsigned __int128) 0x0000000000000000 << 64 | (unsigned __int128) 0x0000000000000014},
+	{ (unsigned __int128) 0x0000000000000000 << 64 | (unsigned __int128) 0x0000000000000015},
+	{ (unsigned __int128) 0x0000000000000000 << 64 | (unsigned __int128) 0x0000000000000016},
+	{ (unsigned __int128) 0x0000000000000000 << 64 | (unsigned __int128) 0x0000000000000017},
+	{ (unsigned __int128) 0x0000000000000000 << 64 | (unsigned __int128) 0x0000000000000018},
+	{ (unsigned __int128) 0x0000000000000000 << 64 | (unsigned __int128) 0x0000000000000089},
+	{ (unsigned __int128) 0x0000000000000000 << 64 | (unsigned __int128) 0x000000000000008a},
+	{ (unsigned __int128) 0x0000000000000000 << 64 | (unsigned __int128) 0x000000000000008b},
+	{ (unsigned __int128) 0x0000000000000000 << 64 | (unsigned __int128) 0x000000000000008c},
+	{ (unsigned __int128) 0x0000000000000000 << 64 | (unsigned __int128) 0x000000000000008d},
+	{ (unsigned __int128) 0x0000000000000000 << 64 | (unsigned __int128) 0x000000000000008e},
+	{ (unsigned __int128) 0x0000000000000000 << 64 | (unsigned __int128) 0x000000000000008f},
+	{ (unsigned __int128) 0x0000000000000000 << 64 | (unsigned __int128) 0x0000000000000080}
+};
+
+__attribute__ ((noinline))
+vector signed __int128 test_sign_extended_load(int RA, signed char * RB) {
+	return vec_xl_sext (RA, RB);
+}
+
+__attribute__ ((noinline))
+vector unsigned __int128 test_zero_extended_unsigned_load(int RA, unsigned char * RB) {
+	return vec_xl_zext (RA, RB);
+}
+
+/* Check vec_xl_sext/vec_xl_zext at each byte offset; abort on any mismatch.  */
+int main (int argc, char *argv [])
+{
+   int iteration=0, mismatch=0;
+   vector signed __int128 signed_result_v;
+   vector unsigned __int128 unsigned_result_v;
+#if VERBOSE
+   verbose=1;
+   printf("%s %s\n", __DATE__, __TIME__);
+#endif
+
+  memcpy(&buffer, &initbuffer, sizeof(buffer));
+
+   if (verbose) {
+	   printf("input buffer:\n");
+	   for (int k=0;k<64;k++) {
+		   printf("%x ",initbuffer[k]);
+		   if (k && (k+1)%16==0) printf("\n");
+	   }
+	   printf("signed_expected:\n");
+	   for (int k=0;k<ITERS;k++) { /* %llx takes 64 bits: print each half, cast down.  */
+		printf("%llx ",(unsigned long long)(signed_expected[k][0]>>64));
+		printf(" %llx \n",(unsigned long long)(signed_expected[k][0]&0xffffffffffffffff));
+		   printf("\n");
+	   }
+	   printf("unsigned_expected:\n");
+	   for (int k=0;k<ITERS;k++) {
+		printf("%llx ",(unsigned long long)(unsigned_expected[k][0]>>64));
+		printf(" %llx \n",(unsigned long long)(unsigned_expected[k][0]&0xffffffffffffffff));
+		   printf("\n");
+	   }
+   }
+
+   for (iteration = 0; iteration < ITERS ; iteration++ ) {
+      signed_result_v = test_sign_extended_load (iteration, (signed char*)buffer);
+      if (signed_result_v[0] != signed_expected[iteration][0] ) {
+		mismatch++;
+		printf("Unexpected results from signed load. i=%d \n", iteration);
+		printf("got:      %llx ",(unsigned long long)(signed_result_v[0]>>64));
+		printf(" %llx \n",(unsigned long long)(signed_result_v[0]&0xffffffffffffffff));
+		printf("expected: %llx ",(unsigned long long)(signed_expected[iteration][0]>>64));
+		printf(" %llx \n",(unsigned long long)(signed_expected[iteration][0]&0xffffffffffffffff));
+		fflush(stdout);
+      }
+   }
+
+   for (iteration = 0; iteration < ITERS ; iteration++ ) {
+      unsigned_result_v = test_zero_extended_unsigned_load (iteration, (unsigned char*)buffer);
+      if (unsigned_result_v[0] != unsigned_expected[iteration][0]) {
+		mismatch++;
+		printf("Unexpected results from unsigned load. i=%d \n", iteration);
+		printf("got:      %llx ",(unsigned long long)(unsigned_result_v[0]>>64));
+		printf(" %llx \n",(unsigned long long)(unsigned_result_v[0]&0xffffffffffffffff));
+		printf("expected: %llx ",(unsigned long long)(unsigned_expected[iteration][0]>>64));
+		printf(" %llx \n",(unsigned long long)(unsigned_expected[iteration][0]&0xffffffffffffffff));
+		fflush(stdout);
+      }
+   }
+
+   if (mismatch) {
+      printf("%d mismatches. \n",mismatch);
+      abort();
+   }
+   return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c
new file mode 100644
index 000000000000..b10d3cb43d2f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c
@@ -0,0 +1,168 @@ 
+/*
+   Test of vec_xl_sext and vec_xl_zext (load into rightmost
+   vector element and zero/sign extend). */
+
+/* { dg-do compile {target power10_ok} } */
+/* { dg-do run {target power10_hw} } */
+
+/* Deliberately set optimization to zero for this test to confirm
+   the lxvr*x instruction is generated. At higher optimization levels
+   the instruction we are looking for is sometimes replaced by other
+   load instructions. */
+/* { dg-options "-mdejagnu-cpu=power10 -O0" } */
+
+/* { dg-final { scan-assembler-times {\mlxvrwx\M} 2 } } */
+
+#define NUM_VEC_ELEMS 4
+#define ITERS 16
+
+/*
+Codegen at time of writing is a single lxvrwx for the zero
+extended test, and a lwax,mtvsrdd,vextsd2q for the sign
+extended test.
+
+0000000010000c90 <test_sign_extended_load>:
+    10000c90:	aa 1a 24 7d 	lwax    r9,r4,r3
+    10000c94:	67 4b 40 7c 	mtvsrdd vs34,0,r9
+    10000c98:	02 16 5b 10 	vextsd2q v2,v2
+    10000c9c:	20 00 80 4e 	blr
+
+0000000010000cb0 <test_zero_extended_unsigned_load>:
+    10000cb0:	9b 18 44 7c 	lxvrwx  vs34,r4,r3
+    10000cb4:	20 00 80 4e 	blr
+*/
+
+#include <altivec.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include <string.h>
+#include <stdlib.h>
+
+long long buffer[8];
+unsigned long verbose=0;
+
+char initbuffer[64] = {
+	0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18,
+			0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x80,
+	0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
+			0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0x90,
+	0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
+			0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xa0,
+	0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
+			0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xb0
+};
+
+vector signed __int128 signed_expected[16] = {
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x0000000014131211},
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x0000000015141312},
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x0000000016151413},
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x0000000017161514},
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x0000000018171615},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0xffffffff89181716},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0xffffffff8a891817},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0xffffffff8b8a8918},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0xffffffff8c8b8a89},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0xffffffff8d8c8b8a},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0xffffffff8e8d8c8b},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0xffffffff8f8e8d8c},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0xffffffff808f8e8d},
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x0000000021808f8e},
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x000000002221808f},
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x0000000023222180}
+};
+
+vector unsigned __int128 unsigned_expected[16] = {
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x0000000014131211},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x0000000015141312},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x0000000016151413},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x0000000017161514},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x0000000018171615},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x0000000089181716},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x000000008a891817},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x000000008b8a8918},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x000000008c8b8a89},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x000000008d8c8b8a},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x000000008e8d8c8b},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x000000008f8e8d8c},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x00000000808f8e8d},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x0000000021808f8e},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x000000002221808f},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x0000000023222180}
+};
+
+__attribute__ ((noinline))
+vector signed __int128 test_sign_extended_load(int RA, signed int * RB) {
+	return vec_xl_sext (RA, RB); /* Load the int at byte offset RA from RB, sign-extended to 128 bits.  */
+}
+
+__attribute__ ((noinline))
+vector unsigned __int128 test_zero_extended_unsigned_load(int RA, unsigned int * RB) {
+	return vec_xl_zext (RA, RB); /* Load the int at byte offset RA from RB, zero-extended to 128 bits.  */
+}
+
+int main (int argc, char *argv [])
+{
+   int iteration=0;
+   int mismatch=0;
+   vector signed __int128 signed_result_v;
+   vector unsigned __int128 unsigned_result_v;
+#if VERBOSE
+   verbose=1;
+   printf("%s %s\n", __DATE__, __TIME__);
+#endif
+   /* Copy the byte pattern into the buffer that the loads below read.  */
+  memcpy(&buffer, &initbuffer, sizeof(buffer));
+   /* printf has no __int128 conversion: print values as two 64-bit halves.  */
+   if (verbose) {
+	   printf("input buffer:\n");
+	   for (int k=0;k<64;k++) {
+		   printf("%x ",(unsigned char)initbuffer[k]);
+		   if (k && (k+1)%16==0) printf("\n");
+	   }
+	   printf("signed_expected:\n");
+	   for (int k=0;k<ITERS;k++) {
+		printf("%llx ",(unsigned long long)(signed_expected[k][0]>>64));
+		printf(" %llx \n",(unsigned long long)signed_expected[k][0]);
+		   printf("\n");
+	   }
+	   printf("unsigned_expected:\n");
+	   for (int k=0;k<ITERS;k++) {
+		printf("%llx ",(unsigned long long)(unsigned_expected[k][0]>>64));
+		printf(" %llx \n",(unsigned long long)unsigned_expected[k][0]);
+		   printf("\n");
+	   }
+   }
+   /* Check the load at every byte offset 0..ITERS-1; abort below on any miss.  */
+   for (iteration = 0; iteration < ITERS ; iteration++ ) {
+      signed_result_v = test_sign_extended_load (iteration, (signed int*)buffer);
+      if (signed_result_v[0] != signed_expected[iteration][0] ) {
+		mismatch++;
+		printf("Unexpected results from signed load. i=%d \n", iteration);
+		printf("got:      %llx ",(unsigned long long)(signed_result_v[0]>>64));
+		printf(" %llx \n",(unsigned long long)signed_result_v[0]);
+		printf("expected: %llx ",(unsigned long long)(signed_expected[iteration][0]>>64));
+		printf(" %llx \n",(unsigned long long)signed_expected[iteration][0]);
+		fflush(stdout);
+      }
+   }
+
+   for (iteration = 0; iteration < ITERS ; iteration++ ) {
+      unsigned_result_v = test_zero_extended_unsigned_load (iteration, (unsigned int*)buffer);
+      if (unsigned_result_v[0] != unsigned_expected[iteration][0]) {
+		mismatch++;
+		printf("Unexpected results from unsigned load. i=%d \n", iteration);
+		printf("got:      %llx ",(unsigned long long)(unsigned_result_v[0]>>64));
+		printf(" %llx \n",(unsigned long long)unsigned_result_v[0]);
+		printf("expected: %llx ",(unsigned long long)(unsigned_expected[iteration][0]>>64));
+		printf(" %llx \n",(unsigned long long)unsigned_expected[iteration][0]);
+		fflush(stdout);
+      }
+   }
+
+   if (mismatch) {
+      printf("%d mismatches. \n",mismatch);
+      abort();
+   }
+   return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-longlong.c b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-longlong.c
new file mode 100644
index 000000000000..52fcf2e572f1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-longlong.c
@@ -0,0 +1,169 @@ 
+/*
+   Test of vec_xl_sext and vec_xl_zext (load into rightmost
+   vector element and zero/sign extend). */
+
+/* { dg-do compile {target power10_ok} } */
+/* { dg-do run {target power10_hw} } */
+/* { dg-options "-mdejagnu-cpu=power10 -O3" } */
+
+/* At time of writing, we also generate a .constprop copy
+   of the function, so our instruction hit count is
+   twice of what we would otherwise expect.  */
+/* { dg-final { scan-assembler-times {\mlxvrdx\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mlvdx\M} 0 } } */
+
+#define NUM_VEC_ELEMS 2
+#define ITERS 16
+
+/*
+Codegen at time of writing uses lxvrdx for both sign and
+zero extend tests. The sign extended test also uses
+mfvsr*d, mtvsrdd, vextsd2q.
+
+0000000010000c90 <test_sign_extended_load>:
+    10000c90:	da 18 04 7c 	lxvrdx  vs0,r4,r3
+    10000c94:	66 00 0b 7c 	mfvsrd  r11,vs0
+    10000c98:	66 02 0a 7c 	mfvsrld r10,vs0
+    10000c9c:	67 53 40 7c 	mtvsrdd vs34,0,r10
+    10000ca0:	02 16 5b 10 	vextsd2q v2,v2
+    10000ca4:	20 00 80 4e 	blr
+
+0000000010000cc0 <test_zero_extended_unsigned_load>:
+    10000cc0:	db 18 44 7c 	lxvrdx  vs34,r4,r3
+    10000cc4:	20 00 80 4e 	blr
+*/
+
+#include <altivec.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include <string.h>
+#include <stdlib.h>
+
+long long buffer[8];
+unsigned long verbose=0;
+
+char initbuffer[64] = {
+	0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18,
+			0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x80,
+	0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
+			0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0x90,
+	0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
+			0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xa0,
+	0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
+			0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xb0
+};
+
+vector signed __int128 signed_expected[16] = {
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x1817161514131211},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0x8918171615141312},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0x8a89181716151413},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0x8b8a891817161514},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0x8c8b8a8918171615},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0x8d8c8b8a89181716},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0x8e8d8c8b8a891817},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0x8f8e8d8c8b8a8918},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0x808f8e8d8c8b8a89},
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x21808f8e8d8c8b8a},
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x2221808f8e8d8c8b},
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x232221808f8e8d8c},
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x24232221808f8e8d},
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x2524232221808f8e},
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x262524232221808f},
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x2726252423222180}
+};
+
+vector unsigned __int128 unsigned_expected[16] = {
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x1817161514131211},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x8918171615141312},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x8a89181716151413},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x8b8a891817161514},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x8c8b8a8918171615},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x8d8c8b8a89181716},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x8e8d8c8b8a891817},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x8f8e8d8c8b8a8918},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x808f8e8d8c8b8a89},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x21808f8e8d8c8b8a},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x2221808f8e8d8c8b},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x232221808f8e8d8c},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x24232221808f8e8d},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x2524232221808f8e},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x262524232221808f},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x2726252423222180}
+};
+
+__attribute__ ((noinline))
+vector signed __int128 test_sign_extended_load(int RA, signed long long * RB) {
+	return vec_xl_sext (RA, RB); /* Load the long long at byte offset RA from RB, sign-extended to 128 bits.  */
+}
+
+__attribute__ ((noinline))
+vector unsigned __int128 test_zero_extended_unsigned_load(int RA, unsigned long long * RB) {
+	return vec_xl_zext (RA, RB); /* Load the long long at byte offset RA from RB, zero-extended to 128 bits.  */
+}
+
+int main (int argc, char *argv [])
+{
+   int iteration=0;
+   int mismatch=0;
+   vector signed __int128 signed_result_v;
+   vector unsigned __int128 unsigned_result_v;
+#if VERBOSE
+   verbose=1;
+   printf("%s %s\n", __DATE__, __TIME__);
+#endif
+   /* Copy the byte pattern into the buffer that the loads below read.  */
+  memcpy(&buffer, &initbuffer, sizeof(buffer));
+   /* printf has no __int128 conversion: print values as two 64-bit halves.  */
+   if (verbose) {
+	   printf("input buffer:\n");
+	   for (int k=0;k<64;k++) {
+		   printf("%x ",(unsigned char)initbuffer[k]);
+		   if (k && (k+1)%16==0) printf("\n");
+	   }
+	   printf("signed_expected:\n");
+	   for (int k=0;k<ITERS;k++) {
+		printf("%llx ",(unsigned long long)(signed_expected[k][0]>>64));
+		printf(" %llx \n",(unsigned long long)signed_expected[k][0]);
+		   printf("\n");
+	   }
+	   printf("unsigned_expected:\n");
+	   for (int k=0;k<ITERS;k++) {
+		printf("%llx ",(unsigned long long)(unsigned_expected[k][0]>>64));
+		printf(" %llx \n",(unsigned long long)unsigned_expected[k][0]);
+		   printf("\n");
+	   }
+   }
+   /* Check the load at every byte offset 0..ITERS-1; abort below on any miss.  */
+   for (iteration = 0; iteration < ITERS ; iteration++ ) {
+      signed_result_v = test_sign_extended_load (iteration, (signed long long*)buffer);
+      if (signed_result_v[0] != signed_expected[iteration][0] ) {
+		mismatch++;
+		printf("Unexpected results from signed load. i=%d \n", iteration);
+		printf("got:      %llx ",(unsigned long long)(signed_result_v[0]>>64));
+		printf(" %llx \n",(unsigned long long)signed_result_v[0]);
+		printf("expected: %llx ",(unsigned long long)(signed_expected[iteration][0]>>64));
+		printf(" %llx \n",(unsigned long long)signed_expected[iteration][0]);
+		fflush(stdout);
+      }
+   }
+
+   for (iteration = 0; iteration < ITERS ; iteration++ ) {
+      unsigned_result_v = test_zero_extended_unsigned_load (iteration, (unsigned long long*)buffer);
+      if (unsigned_result_v[0] != unsigned_expected[iteration][0]) {
+		mismatch++;
+		printf("Unexpected results from unsigned load. i=%d \n", iteration);
+		printf("got:      %llx ",(unsigned long long)(unsigned_result_v[0]>>64));
+		printf(" %llx \n",(unsigned long long)unsigned_result_v[0]);
+		printf("expected: %llx ",(unsigned long long)(unsigned_expected[iteration][0]>>64));
+		printf(" %llx \n",(unsigned long long)unsigned_expected[iteration][0]);
+		fflush(stdout);
+      }
+   }
+
+   if (mismatch) {
+      printf("%d mismatches. \n",mismatch);
+      abort();
+   }
+   return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c
new file mode 100644
index 000000000000..8fc0cc66eb72
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c
@@ -0,0 +1,168 @@ 
+/*
+   Test of vec_xl_sext and vec_xl_zext (load into rightmost
+   vector element and zero/sign extend). */
+
+/* { dg-do compile {target power10_ok} } */
+/* { dg-do run {target power10_hw} } */
+
+/* Deliberately set optimization to zero for this test to confirm
+   the lxvr*x instruction is generated. At higher optimization levels
+   the instruction we are looking for is sometimes replaced by other
+   load instructions.  */
+/* { dg-options "-mdejagnu-cpu=power10 -O0" } */
+
+/* { dg-final { scan-assembler-times {\mlxvrhx\M} 2 } } */
+
+#define NUM_VEC_ELEMS 8
+#define ITERS 16
+
+/*
+Codegen at time of writing uses lxvrhx for the zero
+extension test and lhax,mtvsrdd,vextsd2q for the
+sign extended test.
+
+0000000010001810 <test_sign_extended_load>:
+    10001810:	ae 1a 24 7d 	lhax    r9,r4,r3
+    10001814:	67 4b 40 7c 	mtvsrdd vs34,0,r9
+    10001818:	02 16 5b 10 	vextsd2q v2,v2
+    1000181c:	20 00 80 4e 	blr
+
+0000000010001830 <test_zero_extended_unsigned_load>:
+    10001830:	5b 18 44 7c 	lxvrhx  vs34,r4,r3
+    10001834:	20 00 80 4e 	blr
+*/
+
+#include <altivec.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include <string.h>
+#include <stdlib.h>
+
+long long buffer[8];
+unsigned long verbose=0;
+
+char initbuffer[64] = {
+	0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18,
+			0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x80,
+	0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
+			0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0x90,
+	0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
+			0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xa0,
+	0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
+			0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xb0
+};
+
+vector signed __int128 signed_expected[16] = {
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x0000000000001211},
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x0000000000001312},
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x0000000000001413},
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x0000000000001514},
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x0000000000001615},
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x0000000000001716},
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x0000000000001817},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0xffffffffffff8918},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0xffffffffffff8a89},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0xffffffffffff8b8a},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0xffffffffffff8c8b},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0xffffffffffff8d8c},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0xffffffffffff8e8d},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0xffffffffffff8f8e},
+	{ (__int128) 0xffffffffffffffff << 64 | (__int128) 0xffffffffffff808f},
+	{ (__int128) 0x0000000000000000 << 64 | (__int128) 0x0000000000002180}
+};
+
+vector unsigned __int128 unsigned_expected[16] = {
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x00000000000001211},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x00000000000001312},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x00000000000001413},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x00000000000001514},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x00000000000001615},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x00000000000001716},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x00000000000001817},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x00000000000008918},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x00000000000008a89},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x00000000000008b8a},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x00000000000008c8b},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x00000000000008d8c},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x00000000000008e8d},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x00000000000008f8e},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x0000000000000808f},
+	{ (unsigned __int128) 0x0000000000000000  << 64 | (unsigned __int128) 0x00000000000002180}
+};
+
+__attribute__ ((noinline))
+vector signed __int128 test_sign_extended_load(int RA, signed short * RB) {
+	return vec_xl_sext (RA, RB); /* Load the short at byte offset RA from RB, sign-extended to 128 bits.  */
+}
+
+__attribute__ ((noinline))
+vector unsigned __int128 test_zero_extended_unsigned_load(int RA, unsigned short * RB) {
+	return vec_xl_zext (RA, RB); /* Load the short at byte offset RA from RB, zero-extended to 128 bits.  */
+}
+
+int main (int argc, char *argv [])
+{
+   int iteration=0;
+   int mismatch=0;
+   vector signed __int128 signed_result_v;
+   vector unsigned __int128 unsigned_result_v;
+#if VERBOSE
+   verbose=1;
+   printf("%s %s\n", __DATE__, __TIME__);
+#endif
+   /* Copy the byte pattern into the buffer that the loads below read.  */
+  memcpy(&buffer, &initbuffer, sizeof(buffer));
+   /* printf has no __int128 conversion: print values as two 64-bit halves.  */
+   if (verbose) {
+	   printf("input buffer:\n");
+	   for (int k=0;k<64;k++) {
+		   printf("%x ",(unsigned char)initbuffer[k]);
+		   if (k && (k+1)%16==0) printf("\n");
+	   }
+	   printf("signed_expected:\n");
+	   for (int k=0;k<ITERS;k++) {
+		printf("%llx ",(unsigned long long)(signed_expected[k][0]>>64));
+		printf(" %llx \n",(unsigned long long)signed_expected[k][0]);
+		   printf("\n");
+	   }
+	   printf("unsigned_expected:\n");
+	   for (int k=0;k<ITERS;k++) {
+		printf("%llx ",(unsigned long long)(unsigned_expected[k][0]>>64));
+		printf(" %llx \n",(unsigned long long)unsigned_expected[k][0]);
+		   printf("\n");
+	   }
+   }
+   /* Check the load at every byte offset 0..ITERS-1; abort below on any miss.  */
+   for (iteration = 0; iteration < ITERS ; iteration++ ) {
+      signed_result_v = test_sign_extended_load (iteration, (signed short*)buffer);
+      if (signed_result_v[0] != signed_expected[iteration][0] ) {
+		mismatch++;
+		printf("Unexpected results from signed load. i=%d \n", iteration);
+		printf("got:      %llx ",(unsigned long long)(signed_result_v[0]>>64));
+		printf(" %llx \n",(unsigned long long)signed_result_v[0]);
+		printf("expected: %llx ",(unsigned long long)(signed_expected[iteration][0]>>64));
+		printf(" %llx \n",(unsigned long long)signed_expected[iteration][0]);
+		fflush(stdout);
+      }
+   }
+
+   for (iteration = 0; iteration < ITERS ; iteration++ ) {
+      unsigned_result_v = test_zero_extended_unsigned_load (iteration, (unsigned short*)buffer);
+      if (unsigned_result_v[0] != unsigned_expected[iteration][0]) {
+		mismatch++;
+		printf("Unexpected results from unsigned load. i=%d \n", iteration);
+		printf("got:      %llx ",(unsigned long long)(unsigned_result_v[0]>>64));
+		printf(" %llx \n",(unsigned long long)unsigned_result_v[0]);
+		printf("expected: %llx ",(unsigned long long)(unsigned_expected[iteration][0]>>64));
+		printf(" %llx \n",(unsigned long long)unsigned_expected[iteration][0]);
+		fflush(stdout);
+      }
+   }
+
+   if (mismatch) {
+      printf("%d mismatches. \n",mismatch);
+      abort();
+   }
+   return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-store-element-truncate-char.c b/gcc/testsuite/gcc.target/powerpc/vsx-store-element-truncate-char.c
new file mode 100644
index 000000000000..99f3904983be
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vsx-store-element-truncate-char.c
@@ -0,0 +1,125 @@ 
+/*
+   Test of vec_xst_trunc (truncate and store rightmost vector element) */
+
+/* { dg-do compile {target power10_ok} } */
+/* { dg-do run {target power10_hw} } */
+/* Deliberately set optimization to zero for this test to confirm
+   the stxvr*x instruction is generated. At higher optimization levels
+   the instruction we are looking for is sometimes replaced by other
+   store instructions.  */
+/* { dg-options "-mdejagnu-cpu=power10 -O0" } */
+
+/* { dg-final { scan-assembler-times {\mstxvrbx\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstbx\M} 0 } } */
+
+#include <altivec.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include <string.h>
+#include <stdlib.h>
+
+vector signed __int128 store_this_s[4] = {
+{ (__int128) 0x7000000000000000 << 64 | (__int128) 0x123456789abcdef8ULL},
+{ (__int128) 0x8000000000000000 << 64 | (__int128) 0xfedcba9876543217ULL},
+{ (__int128) 0x1000000000000000 << 64 | (__int128) 0xccccccccccccccccULL},
+{ (__int128) 0xf000000000000000 << 64 | (__int128) 0xaaaaaaaaaaaaaaaaULL}
+};
+
+vector unsigned __int128 store_this_us[4] = {
+{ (unsigned __int128) 0x7000000000000000 << 64 | (unsigned __int128) 0x123456789abcdef8ULL},
+{ (unsigned __int128) 0x8000000000000000 << 64 | (unsigned __int128) 0xfedcba9876543217ULL},
+{ (unsigned __int128) 0x1000000000000000 << 64 | (unsigned __int128) 0xeeeeeeeeeeeeeeeeULL},
+{ (unsigned __int128) 0xf000000000000000 << 64 | (unsigned __int128) 0x5555555555555555ULL}
+};
+
+#define NUM_VEC_ELEMS 16
+
+vector signed char signed_expected[4] = {
+	{ 0xf8, 0x0, 0x0, 0x0, 0x0 , 0x0, 0x0, 0x0, 0x0 , 0x0, 0x0, 0x0, 0x0 , 0x0, 0x0, 0x0},
+	{ 0x0 , 0x0, 0x0, 0x0, 0x17, 0x0, 0x0, 0x0, 0x0 , 0x0, 0x0, 0x0, 0x0 , 0x0, 0x0, 0x0},
+	{ 0x0 , 0x0, 0x0, 0x0, 0x0 , 0x0, 0x0, 0x0, 0xcc, 0x0, 0x0, 0x0, 0x0 , 0x0, 0x0, 0x0},
+	{ 0x0 , 0x0, 0x0, 0x0, 0x0 , 0x0, 0x0, 0x0, 0x0 , 0x0, 0x0, 0x0, 0xaa, 0x0, 0x0, 0x0}
+};
+vector unsigned char unsigned_expected[4] = {
+	{ 0xf8, 0x0, 0x0, 0x0, 0x0 , 0x0, 0x0, 0x0, 0x0 , 0x0, 0x0, 0x0, 0x0 , 0x0, 0x0, 0x0},
+	{ 0x0 , 0x0, 0x0, 0x0, 0x17, 0x0, 0x0, 0x0, 0x0 , 0x0, 0x0, 0x0, 0x0 , 0x0, 0x0, 0x0},
+	{ 0x0 , 0x0, 0x0, 0x0, 0x0 , 0x0, 0x0, 0x0, 0xee, 0x0, 0x0, 0x0, 0x0 , 0x0, 0x0, 0x0},
+	{ 0x0 , 0x0, 0x0, 0x0, 0x0 , 0x0, 0x0, 0x0, 0x0 , 0x0, 0x0, 0x0, 0x55, 0x0, 0x0, 0x0}
+};
+
+unsigned long long rawbuffer[32]; /* Store target; viewed through the two typed pointers below.  */
+signed char * vsbuffer = (signed char *)rawbuffer; /* Cast matches the declared pointer type.  */
+unsigned char * vubuffer = (unsigned char *)rawbuffer;
+
+void reset_buffer() {
+	memset (&rawbuffer,0,sizeof(rawbuffer)); /* Zero all of rawbuffer before the next store.  */
+}
+
+#define PRINT_VEC(V) /* Print the first NUM_VEC_ELEMS byte elements of V in hex.  */ \
+   for (int j=0;j<NUM_VEC_ELEMS;j++) {	printf ("(0x%lx) ", (unsigned long)(unsigned char) (V)[j] ); }
+
+void test_signed_store(vector signed __int128 myvec, int offset, signed char * store_data ) {
+	vec_xst_trunc (myvec, offset, store_data ); /* Store myvec truncated to one char at byte offset from store_data.  */
+}
+
+void test_unsigned_store(vector unsigned __int128 myvec, int offset, unsigned char * store_data )   {
+	vec_xst_trunc (myvec, offset, store_data ); /* Store myvec truncated to one char at byte offset from store_data.  */
+}
+
+int main (int argc, char *argv [])
+{
+   int i;
+   int memcmpresult;
+   int mismatch=0;
+   int verbose=0;
+   /* Building with VERBOSE defined non-zero enables the diagnostic dumps.  */
+#if VERBOSE
+   verbose=1;
+   printf("%s %s\n", __DATE__, __TIME__);
+#endif
+
+   if (verbose) {
+      printf("expected results from signed tests:\n");
+      for (i = 0; i < 4 ; i++ ) {
+	 PRINT_VEC(signed_expected[i]);
+	 printf("\n");
+      }
+   }
+   /* Per case: zero the buffer, store at byte offset 4*i, compare one vector's worth of bytes.  */
+   for (i = 0; i < 4 ; i++ ) {
+      reset_buffer();
+      test_signed_store (store_this_s[i], 4*i, vsbuffer);
+      memcmpresult = memcmp(rawbuffer,&signed_expected[i],sizeof(vector char));
+      if (memcmpresult) {
+	 printf("mismatch signed buffer, i %d (memcmpresult:%d) \n",i,memcmpresult);
+	 mismatch++;
+	 if (verbose) {
+	    printf("results: ");
+	    PRINT_VEC(vsbuffer);
+	    printf("\n");
+	 }
+      }
+   }
+   /* Same sequence for the unsigned inputs and expectations.  */
+   for (i = 0; i < 4 ; i++ ) {
+      reset_buffer();
+      test_unsigned_store (store_this_us[i], 4*i, vubuffer);
+      memcmpresult = memcmp(rawbuffer,&unsigned_expected[i],sizeof(vector char));
+      if (memcmpresult) {
+	 printf("mismatch unsigned buffer, i %d (memcmpresult:%d) \n",i,memcmpresult);
+	 mismatch++;
+	 if (verbose) {
+	    printf("results :");
+	    PRINT_VEC(vubuffer);
+	    printf("\n");
+	 }
+      }
+   }
+   /* Any mismatch fails the run via abort().  */
+   if (mismatch) {
+      printf("%d mismatches. \n",mismatch);
+      abort();
+   }
+   return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-store-element-truncate-int.c b/gcc/testsuite/gcc.target/powerpc/vsx-store-element-truncate-int.c
new file mode 100644
index 000000000000..6e2acf83c383
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vsx-store-element-truncate-int.c
@@ -0,0 +1,125 @@ 
+/*
+   Test of vec_xst_trunc (truncate and store rightmost vector element) */
+
+/* { dg-do compile {target power10_ok} } */
+/* { dg-do run {target power10_hw} } */
+/* Deliberately set optimization to zero for this test to confirm
+   the stxvr*x instruction is generated. At higher optimization levels
+   the instruction we are looking for is sometimes replaced by other
+   store instructions.  */
+/* { dg-options "-mdejagnu-cpu=power10 -O0" } */
+ 
+/* { dg-final { scan-assembler-times {\mstxvrwx\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstwx\M} 0 } } */
+
+#include <altivec.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include <string.h>
+#include <stdlib.h>
+
+vector signed __int128 store_this_s[4] = {
+{ (__int128) 0x7000000000000000 << 64 | (__int128) 0x123456789abcdef8ULL},
+{ (__int128) 0x8000000000000000 << 64 | (__int128) 0xfedcba9876543217ULL},
+{ (__int128) 0x1000000000000000 << 64 | (__int128) 0xccccccccccccccccULL},
+{ (__int128) 0xf000000000000000 << 64 | (__int128) 0xaaaaaaaaaaaaaaaaULL}
+};
+
+vector unsigned __int128 store_this_us[4] = {
+{ (unsigned __int128) 0x7000000000000000 << 64 | (unsigned __int128) 0x123456789abcdef8ULL},
+{ (unsigned __int128) 0x8000000000000000 << 64 | (unsigned __int128) 0xfedcba9876543217ULL},
+{ (unsigned __int128) 0x1000000000000000 << 64 | (unsigned __int128) 0xeeeeeeeeeeeeeeeeULL},
+{ (unsigned __int128) 0xf000000000000000 << 64 | (unsigned __int128) 0x5555555555555555ULL}
+};
+
+#define NUM_VEC_ELEMS 4
+
+vector signed int signed_expected[4] = {
+	{0x9abcdef8, 0x0       , 0x0       , 0x0        },
+	{0x0       , 0x76543217, 0x0       , 0x0        },
+	{0x0       , 0x0       , 0xcccccccc, 0x0        },
+	{0x0       , 0x0       , 0x0       , 0xaaaaaaaa },
+};
+vector unsigned int unsigned_expected[4] = {
+	{0x9abcdef8, 0x0       , 0x0       , 0x0        },
+	{0x0       , 0x76543217, 0x0       , 0x0        },
+	{0x0       , 0x0       , 0xeeeeeeee, 0x0        },
+	{0x0       , 0x0       , 0x0       , 0x55555555 },
+};
+
+unsigned long long rawbuffer[32];
+signed int * vsbuffer = (int *)rawbuffer;
+unsigned int * vubuffer = (unsigned int *)rawbuffer;
+
+void reset_buffer() {
+	memset (&rawbuffer,0,sizeof(rawbuffer)); /* Zero all of rawbuffer before the next store.  */
+}
+
+#define PRINT_VEC(V) /* Print the first NUM_VEC_ELEMS int elements of V in hex.  */ \
+   for (int j=0;j<NUM_VEC_ELEMS;j++) {	printf ("(0x%lx) ", (unsigned long)(unsigned int) (V)[j] ); }
+
+void test_signed_store(vector signed __int128 myvec, int offset, signed int * store_data ) {
+	vec_xst_trunc (myvec, offset, store_data); /* Store myvec truncated to one int at byte offset from store_data.  */
+}
+
+void test_unsigned_store(vector unsigned __int128 myvec, int offset, unsigned int * store_data ) {
+	vec_xst_trunc (myvec, offset, store_data); /* Store myvec truncated to one int at byte offset from store_data.  */
+}
+
+int main (int argc, char *argv [])
+{
+   int i;
+   int memcmpresult;
+   int mismatch=0;
+   int verbose=0;
+   /* Building with VERBOSE defined non-zero enables the diagnostic dumps.  */
+#if VERBOSE
+   verbose=1;
+   printf("%s %s\n", __DATE__, __TIME__);
+#endif
+
+   if (verbose) {
+      printf("expected results from signed tests:\n");
+      for (i = 0; i < 4 ; i++ ) {
+	 PRINT_VEC(signed_expected[i]);
+	 printf("\n");
+      }
+   }
+   /* Per case: zero the buffer, store at byte offset 4*i, compare one vector's worth of bytes.  */
+   for (i = 0; i < 4 ; i++ ) {
+      reset_buffer();
+      test_signed_store (store_this_s[i], 4*i, vsbuffer);
+      memcmpresult = memcmp(rawbuffer,&signed_expected[i],sizeof(vector int));
+      if (memcmpresult) {
+	 printf("mismatch signed buffer, i %d (memcmpresult:%d) \n",i,memcmpresult);
+	 mismatch++;
+	 if (verbose) {
+	    printf("results: ");
+	    PRINT_VEC(vsbuffer);
+	    printf("\n");
+	 }
+      }
+   }
+   /* Same sequence for the unsigned inputs and expectations.  */
+   for (i = 0; i < 4 ; i++ ) {
+      reset_buffer();
+      test_unsigned_store (store_this_us[i], 4*i, vubuffer);
+      memcmpresult = memcmp(rawbuffer,&unsigned_expected[i],sizeof(vector int));
+      if (memcmpresult) {
+	 printf("mismatch unsigned buffer, i %d (memcmpresult:%d) \n",i,memcmpresult);
+	 mismatch++;
+	 if (verbose) {
+	    printf("results :");
+	    PRINT_VEC(vubuffer);
+	    printf("\n");
+	 }
+      }
+   }
+   /* Any mismatch fails the run via abort().  */
+   if (mismatch) {
+      printf("%d mismatches. \n",mismatch);
+      abort();
+   }
+   return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-store-element-truncate-longlong.c b/gcc/testsuite/gcc.target/powerpc/vsx-store-element-truncate-longlong.c
new file mode 100644
index 000000000000..7fce6a44d4f5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vsx-store-element-truncate-longlong.c
@@ -0,0 +1,126 @@ 
+/*
+   Test of vec_xst_trunc (truncate and store rightmost vector element) */
+
+/* { dg-do compile {target power10_ok} } */
+/* { dg-do run {target power10_hw} } */
+
+/* Deliberately set optimization to zero for this test to confirm
+   the stxvr*x instruction is generated. At higher optimization levels
+   the instruction we are looking for is sometimes replaced by other
+   store instructions. */
+/* { dg-options "-mdejagnu-cpu=power10 -O0" } */
+
+/* { dg-final { scan-assembler-times {\mstxvrdx\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstwx\M} 0 } } */
+
+#include <altivec.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include <string.h>
+#include <stdlib.h>
+
+vector signed __int128 store_this_s[4] = {	/* Inputs for test_signed_store: (high 64 bits << 64) | low 64 bits.  */
+  { ((__int128) 0x7000000000000000ULL << 64) | 0x123456789abcdef8ULL },
+  { ((__int128) 0x8000000000000000ULL << 64) | 0xfedcba9876543217ULL },
+  { ((__int128) 0x1000000000000000ULL << 64) | 0xccccccccccccccccULL },
+  { ((__int128) 0xf000000000000000ULL << 64) | 0xaaaaaaaaaaaaaaaaULL }
+};
+
+vector unsigned __int128 store_this_us[4] = {	/* Inputs for test_unsigned_store: (high 64 bits << 64) | low 64 bits.  */
+  { ((unsigned __int128) 0x7000000000000000ULL << 64) | 0x123456789abcdef8ULL },
+  { ((unsigned __int128) 0x8000000000000000ULL << 64) | 0xfedcba9876543217ULL },
+  { ((unsigned __int128) 0x1000000000000000ULL << 64) | 0xeeeeeeeeeeeeeeeeULL },
+  { ((unsigned __int128) 0xf000000000000000ULL << 64) | 0x5555555555555555ULL }
+};
+
+#define NUM_VEC_ELEMS 2	/* Number of elements PRINT_VEC walks.  */
+
+vector signed long long signed_expected[4] = {	/* Row i: expected leading two doublewords of rawbuffer after the store at offset 4*i.  */
+	{ 0x123456789abcdef8,                0x0},
+	{ 0x7654321700000000,         0xfedcba98},
+	{ 0x0000000000000000, 0xcccccccccccccccc},
+	{ 0x0000000000000000, 0xaaaaaaaa00000000}  /* Note: the rest of this store lands beyond the compared region.  */
+};
+vector unsigned long long unsigned_expected[4] = {	/* Row i: expected leading two doublewords of rawbuffer after the store at offset 4*i.  */
+	{ 0x123456789abcdef8,                0x0},
+	{ 0x7654321700000000,         0xfedcba98},
+	{ 0x0000000000000000, 0xeeeeeeeeeeeeeeee},
+	{ 0x0000000000000000, 0x5555555500000000}  /* Note: the rest of this store lands beyond the compared region.  */
+};
+
+unsigned long long rawbuffer[32];	/* Backing storage the test stores write into.  */
+signed long long *vsbuffer = (signed long long *) rawbuffer;	/* signed view of rawbuffer */
+unsigned long long *vubuffer = rawbuffer;	/* unsigned view of rawbuffer */
+
+void reset_buffer(void) {	/* Zero all of rawbuffer; takes no arguments (explicit prototype).  */
+	memset (rawbuffer, 0, sizeof (rawbuffer));
+}
+
+#define PRINT_VEC(V) /* Print NUM_VEC_ELEMS elements of V in hex; %llx matches the long long element type.  */ \
+   for (int j=0;j<NUM_VEC_ELEMS;j++) {	printf ("(0x%llx) ", (unsigned long long) (V)[j] ); }
+
+void test_signed_store(vector signed __int128 myvec, int offset, signed long long * store_data ) {	/* Thin wrapper so the signed vec_xst_trunc call is compiled as its own function.  */
+	vec_xst_trunc (myvec, offset, store_data);	/* all three arguments are forwarded unchanged */
+}
+
+void test_unsigned_store(vector unsigned __int128 myvec, int offset, unsigned long long * store_data )   {	/* Unsigned counterpart of test_signed_store.  */
+	vec_xst_trunc (myvec, offset, store_data);	/* all three arguments are forwarded unchanged */
+}
+
+int main (int argc, char *argv [])	/* Runs four signed and four unsigned store checks; aborts if any comparison fails.  argc/argv are unused.  */
+{
+   int i;
+   int memcmpresult;
+   int mismatch=0;	/* number of failed comparisons */
+   int verbose=0;	/* becomes 1 when built with a non-zero VERBOSE */
+
+#if VERBOSE
+   verbose=1;
+   printf("%s %s\n", __DATE__, __TIME__);
+#endif
+
+   if (verbose) {	/* dump the signed expected table */
+      printf("expected results from signed tests:\n");
+      for (i = 0; i < 4 ; i++ ) {
+	 PRINT_VEC(signed_expected[i]);
+	 printf("\n");
+      }
+   }
+
+   for (i = 0; i < 4 ; i++ ) {	/* signed cases, one store per input vector */
+      reset_buffer();	/* clear rawbuffer so only this store is visible */
+      test_signed_store (store_this_s[i], 4*i, vsbuffer);	/* offset grows by 4 per case, so cases 1 and 3 are not 8-byte multiples */
+      memcmpresult = memcmp(rawbuffer,&signed_expected[i],sizeof(vector long long));	/* only the leading sizeof(vector long long) bytes are compared */
+      if (memcmpresult) {
+	 printf("mismatch signed buffer, i %d (memcmpresult:%d) \n",i,memcmpresult);
+	 mismatch++;
+	 if (verbose) {
+	    printf("results: ");
+	    PRINT_VEC(vsbuffer);
+	    printf("\n");
+	 }
+      }
+   }
+
+   for (i = 0; i < 4 ; i++ ) {	/* unsigned cases, same procedure */
+      reset_buffer();
+      test_unsigned_store (store_this_us[i], 4*i, vubuffer);
+      memcmpresult = memcmp(rawbuffer,&unsigned_expected[i],sizeof(vector long long));
+      if (memcmpresult) {
+	 printf("mismatch unsigned buffer, i %d (memcmpresult:%d) \n",i,memcmpresult);
+	 mismatch++;
+	 if (verbose) {
+	    printf("results :");
+	    PRINT_VEC(vubuffer);
+	    printf("\n");
+	 }
+      }
+   }
+
+   if (mismatch) {	/* any failed comparison fails the run test */
+      printf("%d mismatches. \n",mismatch);
+      abort();
+   }
+   return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-store-element-truncate-short.c b/gcc/testsuite/gcc.target/powerpc/vsx-store-element-truncate-short.c
new file mode 100644
index 000000000000..17925c87732e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vsx-store-element-truncate-short.c
@@ -0,0 +1,126 @@ 
+/*
+   Test of vec_xst_trunc (truncate and store rightmost vector element) */
+
+/* { dg-do compile {target power10_ok} } */
+/* { dg-do run {target power10_hw} } */
+
+/* Deliberately set optimization to zero for this test to confirm
+   the stxvr*x instruction is generated. At higher optimization levels
+   the instruction we are looking for is sometimes replaced by other
+   store instructions.  */
+/* { dg-options "-mdejagnu-cpu=power10 -O0" } */
+
+/* { dg-final { scan-assembler-times {\mstxvrhx\M} 2 } } */
+/* { dg-final { scan-assembler-times {\msthx\M} 0 } } */
+
+#include <altivec.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include <string.h>
+#include <stdlib.h>
+
+vector signed __int128 store_this_s[4] = {	/* Inputs for test_signed_store: (high 64 bits << 64) | low 64 bits.  */
+  { ((__int128) 0x7000000000000000ULL << 64) | 0x123456789abcdef8ULL },
+  { ((__int128) 0x8000000000000000ULL << 64) | 0xfedcba9876543217ULL },
+  { ((__int128) 0x1000000000000000ULL << 64) | 0xccccccccccccccccULL },
+  { ((__int128) 0xf000000000000000ULL << 64) | 0xaaaaaaaaaaaaaaaaULL }
+};
+
+vector unsigned __int128 store_this_us[4] = {	/* Inputs for test_unsigned_store: (high 64 bits << 64) | low 64 bits.  */
+  { ((unsigned __int128) 0x7000000000000000ULL << 64) | 0x123456789abcdef8ULL },
+  { ((unsigned __int128) 0x8000000000000000ULL << 64) | 0xfedcba9876543217ULL },
+  { ((unsigned __int128) 0x1000000000000000ULL << 64) | 0xeeeeeeeeeeeeeeeeULL },
+  { ((unsigned __int128) 0xf000000000000000ULL << 64) | 0x5555555555555555ULL }
+};
+
+#define NUM_VEC_ELEMS 8	/* Number of elements PRINT_VEC walks.  */
+
+vector signed short signed_expected[4] = {	/* Row i: the stored halfword appears in element 2*i; everything else stays zero.  */
+	{ 0xdef8, 0, 0,      0, 0,      0, 0,      0 },
+	{ 0,      0, 0x3217, 0, 0,      0, 0,      0 },
+	{ 0,      0, 0,      0, 0xcccc, 0, 0,      0 },
+	{ 0,      0, 0,      0, 0,      0, 0xaaaa, 0 }
+};
+vector unsigned short unsigned_expected[4] = {	/* Row i: the stored halfword appears in element 2*i; everything else stays zero.  */
+	{ 0xdef8, 0, 0,      0, 0,      0, 0,      0 },
+	{ 0,      0, 0x3217, 0, 0,      0, 0,      0 },
+	{ 0,      0, 0,      0, 0xeeee, 0, 0,      0 },
+	{ 0,      0, 0,      0, 0,      0, 0x5555, 0 }
+};
+
+unsigned long long rawbuffer[32];	/* Backing storage the test stores write into.  */
+signed short *vsbuffer = (signed short *) rawbuffer;	/* signed halfword view of rawbuffer */
+unsigned short *vubuffer = (unsigned short *) rawbuffer;	/* unsigned halfword view of rawbuffer */
+
+void reset_buffer(void) {	/* Zero all of rawbuffer; takes no arguments (explicit prototype).  */
+	memset (rawbuffer, 0, sizeof (rawbuffer));
+}
+
+#define PRINT_VEC(V) /* Print NUM_VEC_ELEMS elements of V in hex; %hx with a cast prints exactly the 16-bit element.  */ \
+   for (int j=0;j<NUM_VEC_ELEMS;j++) {	printf ("(0x%hx) ", (unsigned short) (V)[j] ); }
+
+void test_signed_store(vector signed __int128 myvec, int offset, signed short * store_data ) {	/* Thin wrapper so the signed vec_xst_trunc call is compiled as its own function.  */
+	vec_xst_trunc (myvec, offset, store_data);	/* all three arguments are forwarded unchanged */
+}
+
+void test_unsigned_store(vector unsigned __int128 myvec, int offset, unsigned short * store_data )   {	/* Unsigned counterpart of test_signed_store.  */
+	vec_xst_trunc (myvec, offset, store_data);	/* all three arguments are forwarded unchanged */
+}
+
+int main (int argc, char *argv [])	/* Runs four signed and four unsigned store checks; aborts if any comparison fails.  argc/argv are unused.  */
+{
+   int i;
+   int memcmpresult;
+   int mismatch=0;	/* number of failed comparisons */
+   int verbose=0;	/* becomes 1 when built with a non-zero VERBOSE */
+
+#if VERBOSE
+   verbose=1;
+   printf("%s %s\n", __DATE__, __TIME__);
+#endif
+
+   if (verbose) {	/* dump the signed expected table */
+      printf("expected results from signed tests:\n");
+      for (i = 0; i < 4 ; i++ ) {
+	 PRINT_VEC(signed_expected[i]);
+	 printf("\n");
+      }
+   }
+
+   for (i = 0; i < 4 ; i++ ) {	/* signed cases, one store per input vector */
+      reset_buffer();	/* clear rawbuffer so only this store is visible */
+      test_signed_store (store_this_s[i], 4*i, vsbuffer);	/* offset grows by 4 per case; expected rows place the value in every second halfword */
+      memcmpresult = memcmp(rawbuffer,&signed_expected[i],sizeof(vector short));	/* only the leading sizeof(vector short) bytes are compared */
+      if (memcmpresult) {
+	 printf("mismatch signed buffer, i %d (memcmpresult:%d) \n",i,memcmpresult);
+	 mismatch++;
+	 if (verbose) {
+	    printf("results: ");
+	    PRINT_VEC(vsbuffer);
+	    printf("\n");
+	 }
+      }
+   }
+
+   for (i = 0; i < 4 ; i++ ) {	/* unsigned cases, same procedure */
+      reset_buffer();
+      test_unsigned_store (store_this_us[i], 4*i, vubuffer);
+      memcmpresult = memcmp(rawbuffer,&unsigned_expected[i],sizeof(vector short));
+      if (memcmpresult) {
+	 printf("mismatch unsigned buffer, i %d (memcmpresult:%d) \n",i,memcmpresult);
+	 mismatch++;
+	 if (verbose) {
+	    printf("results :");
+	    PRINT_VEC(vubuffer);
+	    printf("\n");
+	 }
+      }
+   }
+
+   if (mismatch) {	/* any failed comparison fails the run test */
+      printf("%d mismatches. \n",mismatch);
+      abort();
+   }
+   return 0;
+}
+