diff mbox series

middle-end: check if target can do extract first for early breaks [PR113199]

Message ID patch-18119-tamar@arm.com
State New
Headers show
Series middle-end: check if target can do extract first for early breaks [PR113199] | expand

Commit Message

Tamar Christina Jan. 2, 2024, 5:57 p.m. UTC
Hi All,

I was generating the vector reverse mask without checking if the target
actually supported such an operation.

It also seems like more targets implement VEC_EXTRACT than permute on mask
registers.

So this adds a check for IFN_VEC_EXTRACT support when required and changes
the select first code to use it.

This is good for now since masks always come from whilelo.  But in the future
when masks can come from other sources we will need the old code back.

Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
and no issues with --enable-checking=release --enable-lto
--with-build-config=bootstrap-O3 --enable-checking=yes,rtl,extra.
tested on cross cc1 for amdgcn-amdhsa and issue fixed.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	PR tree-optimization/113199
	* tree-vect-loop.cc (vectorizable_live_operation_1): Use
	IFN_VEC_EXTRACT.
	(vectorizable_live_operation): Check for IFN_VEC_EXTRACT support.

gcc/testsuite/ChangeLog:

	PR tree-optimization/113199
	* gcc.target/gcn/pr113199.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/testsuite/gcc.target/gcn/pr113199.c b/gcc/testsuite/gcc.target/gcn/pr113199.c
new file mode 100644
index 0000000000000000000000000000000000000000..8a641e5536e80e207ca0163cac66c0f4f6ca93f7




--
diff --git a/gcc/testsuite/gcc.target/gcn/pr113199.c b/gcc/testsuite/gcc.target/gcn/pr113199.c
new file mode 100644
index 0000000000000000000000000000000000000000..8a641e5536e80e207ca0163cac66c0f4f6ca93f7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/gcn/pr113199.c
@@ -0,0 +1,44 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2" } */
+
+typedef long unsigned int size_t;
+typedef int wchar_t;
+struct tm
+{
+  int tm_mon;
+  int tm_year;
+};
+int abs (int);
+struct lc_time_T { const char *month[12]; };
+struct __locale_t * __get_current_locale (void) { }
+const struct lc_time_T * __get_time_locale (struct __locale_t *locale) { }
+const wchar_t * __ctloc (wchar_t *buf, const char *elem, size_t *len_ret) { return buf; }
+size_t
+__strftime (wchar_t *s, size_t maxsize, const wchar_t *format,
+     const struct tm *tim_p, struct __locale_t *locale)
+{
+  size_t count = 0;
+  const wchar_t *ctloc;
+  wchar_t ctlocbuf[256];
+  size_t i, ctloclen;
+  const struct lc_time_T *_CurrentTimeLocale = __get_time_locale (locale);
+    {
+      switch (*format)
+ {
+ case L'B':
+   (ctloc = __ctloc (ctlocbuf, _CurrentTimeLocale->month[tim_p->tm_mon], &ctloclen));
+   for (i = 0; i < ctloclen; i++)
+     {
+       if (count < maxsize - 1)
+  s[count++] = ctloc[i];
+       else
+  return 0;
+       {
+  int century = tim_p->tm_year >= 0
+    ? tim_p->tm_year / 100 + 1900 / 100
+    : abs (tim_p->tm_year + 1900) / 100;
+       }
+   }
+ }
+    }
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 37f1be1101ffae779214056a0886411e0683e887..5aa92e67444e7aacf458fffa1428f1983c482374 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10648,36 +10648,18 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
 				      &LOOP_VINFO_MASKS (loop_vinfo),
 				      1, vectype, 0);
       tree scalar_res;
+      gimple_seq_add_seq (&stmts, tem);
 
       /* For an inverted control flow with early breaks we want EXTRACT_FIRST
-	 instead of EXTRACT_LAST.  Emulate by reversing the vector and mask. */
+	 instead of EXTRACT_LAST.  For now since the mask always comes from a
+	 WHILELO we can get the first element ignoring the mask since CLZ of the
+	 mask will always be zero.  */
       if (restart_loop && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
-	{
-	  /* First create the permuted mask.  */
-	  tree perm_mask = perm_mask_for_reverse (TREE_TYPE (mask));
-	  tree perm_dest = copy_ssa_name (mask);
-	  gimple *perm_stmt
-		= gimple_build_assign (perm_dest, VEC_PERM_EXPR, mask,
-				       mask, perm_mask);
-	  vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
-				       &gsi);
-	  mask = perm_dest;
-
-	  /* Then permute the vector contents.  */
-	  tree perm_elem = perm_mask_for_reverse (vectype);
-	  perm_dest = copy_ssa_name (vec_lhs_phi);
-	  perm_stmt
-		= gimple_build_assign (perm_dest, VEC_PERM_EXPR, vec_lhs_phi,
-				       vec_lhs_phi, perm_elem);
-	  vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
-				       &gsi);
-	  vec_lhs_phi = perm_dest;
-	}
-
-      gimple_seq_add_seq (&stmts, tem);
-
-      scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
-				 mask, vec_lhs_phi);
+	scalar_res = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
+				   vec_lhs_phi, bitstart);
+      else
+	scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
+				   mask, vec_lhs_phi);
 
       /* Convert the extracted vector element to the scalar type.  */
       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
@@ -10852,9 +10834,25 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
 	      gcc_assert (ncopies == 1 && !slp_node);
 	      if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
 						  OPTIMIZE_FOR_SPEED))
-		vect_record_loop_mask (loop_vinfo,
-				       &LOOP_VINFO_MASKS (loop_vinfo),
-				       1, vectype, NULL);
+		{
+		  if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)
+		      && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
+		      && !direct_internal_fn_supported_p (IFN_EXTRACT_LAST,
+							  vectype,
+							  OPTIMIZE_FOR_SPEED))
+		    {
+		      if (dump_enabled_p ())
+			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				"can't operate on partial vectors "
+				"because the target doesn't support extract "
+				"first reduction.\n");
+		      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+		    }
+		  else
+		    vect_record_loop_mask (loop_vinfo,
+					   &LOOP_VINFO_MASKS (loop_vinfo),
+					   1, vectype, NULL);
+		}
 	      else if (can_vec_extract_var_idx_p (
 			 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
 		vect_record_loop_len (loop_vinfo,

Comments

Richard Biener Jan. 8, 2024, 12:47 p.m. UTC | #1
On Tue, 2 Jan 2024, Tamar Christina wrote:

> Hi All,
> 
> I was generating the vector reverse mask without checking if the target
> actually supported such an operation.
> 
> It also seems like more targets implement VEC_EXTRACT than permute on mask
> registers.
> 
> So this adds a check for IFN_VEC_EXTRACT support when required and changes
> the select first code to use it.
> 
> This is good for now since masks always come from whilelo.  But in the future
> when masks can come from other sources we will need the old code back.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> and no issues with --enable-checking=release --enable-lto
> --with-build-config=bootstrap-O3 --enable-checking=yes,rtl,extra.
> tested on cross cc1 for amdgcn-amdhsa and issue fixed.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	PR tree-optimization/113199
> 	* tree-vect-loop.cc (vectorizable_live_operation_1): Use
> 	IFN_VEC_EXTRACT.
> 	(vectorizable_live_operation): Check for IFN_VEC_EXTRACT support.
> 
> gcc/testsuite/ChangeLog:
> 
> 	PR tree-optimization/113199
> 	* gcc.target/gcn/pr113199.c: New test.
> 
> --- inline copy of patch -- 
> diff --git a/gcc/testsuite/gcc.target/gcn/pr113199.c b/gcc/testsuite/gcc.target/gcn/pr113199.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..8a641e5536e80e207ca0163cac66c0f4f6ca93f7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/gcn/pr113199.c
> @@ -0,0 +1,44 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O2" } */
> +
> +typedef long unsigned int size_t;
> +typedef int wchar_t;
> +struct tm
> +{
> +  int tm_mon;
> +  int tm_year;
> +};
> +int abs (int);
> +struct lc_time_T { const char *month[12]; };
> +struct __locale_t * __get_current_locale (void) { }
> +const struct lc_time_T * __get_time_locale (struct __locale_t *locale) { }
> +const wchar_t * __ctloc (wchar_t *buf, const char *elem, size_t *len_ret) { return buf; }
> +size_t
> +__strftime (wchar_t *s, size_t maxsize, const wchar_t *format,
> +     const struct tm *tim_p, struct __locale_t *locale)
> +{
> +  size_t count = 0;
> +  const wchar_t *ctloc;
> +  wchar_t ctlocbuf[256];
> +  size_t i, ctloclen;
> +  const struct lc_time_T *_CurrentTimeLocale = __get_time_locale (locale);
> +    {
> +      switch (*format)
> + {
> + case L'B':
> +   (ctloc = __ctloc (ctlocbuf, _CurrentTimeLocale->month[tim_p->tm_mon], &ctloclen));
> +   for (i = 0; i < ctloclen; i++)
> +     {
> +       if (count < maxsize - 1)
> +  s[count++] = ctloc[i];
> +       else
> +  return 0;
> +       {
> +  int century = tim_p->tm_year >= 0
> +    ? tim_p->tm_year / 100 + 1900 / 100
> +    : abs (tim_p->tm_year + 1900) / 100;
> +       }
> +   }
> + }
> +    }
> +}
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 37f1be1101ffae779214056a0886411e0683e887..5aa92e67444e7aacf458fffa1428f1983c482374 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -10648,36 +10648,18 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
>  				      &LOOP_VINFO_MASKS (loop_vinfo),
>  				      1, vectype, 0);
>        tree scalar_res;
> +      gimple_seq_add_seq (&stmts, tem);
>  
>        /* For an inverted control flow with early breaks we want EXTRACT_FIRST
> -	 instead of EXTRACT_LAST.  Emulate by reversing the vector and mask. */
> +	 instead of EXTRACT_LAST.  For now since the mask always comes from a
> +	 WHILELO we can get the first element ignoring the mask since CLZ of the
> +	 mask will always be zero.  */
>        if (restart_loop && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> -	{
> -	  /* First create the permuted mask.  */
> -	  tree perm_mask = perm_mask_for_reverse (TREE_TYPE (mask));
> -	  tree perm_dest = copy_ssa_name (mask);
> -	  gimple *perm_stmt
> -		= gimple_build_assign (perm_dest, VEC_PERM_EXPR, mask,
> -				       mask, perm_mask);
> -	  vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
> -				       &gsi);
> -	  mask = perm_dest;
> -
> -	  /* Then permute the vector contents.  */
> -	  tree perm_elem = perm_mask_for_reverse (vectype);
> -	  perm_dest = copy_ssa_name (vec_lhs_phi);
> -	  perm_stmt
> -		= gimple_build_assign (perm_dest, VEC_PERM_EXPR, vec_lhs_phi,
> -				       vec_lhs_phi, perm_elem);
> -	  vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
> -				       &gsi);
> -	  vec_lhs_phi = perm_dest;
> -	}
> -
> -      gimple_seq_add_seq (&stmts, tem);
> -
> -      scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> -				 mask, vec_lhs_phi);
> +	scalar_res = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
> +				   vec_lhs_phi, bitstart);

So bitstart is always zero?  I wonder why using CFN_VEC_EXTRACT over
BIT_FIELD_REF here which wouldn't need any additional target support.

> +      else
> +	scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> +				   mask, vec_lhs_phi);
>  
>        /* Convert the extracted vector element to the scalar type.  */
>        new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> @@ -10852,9 +10834,25 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>  	      gcc_assert (ncopies == 1 && !slp_node);
>  	      if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
>  						  OPTIMIZE_FOR_SPEED))
> -		vect_record_loop_mask (loop_vinfo,
> -				       &LOOP_VINFO_MASKS (loop_vinfo),
> -				       1, vectype, NULL);
> +		{
> +		  if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)
> +		      && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> +		      && !direct_internal_fn_supported_p (IFN_EXTRACT_LAST,
> +							  vectype,
> +							  OPTIMIZE_FOR_SPEED))

I don't quite understand this part - this is guarded by

direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
                                                 OPTIMIZE_FOR_SPEED)

unless I'm mis-matching this means the checked 
!direct_internal_fn_supported_p condition is always false?

> +		    {
> +		      if (dump_enabled_p ())
> +			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +				"can't operate on partial vectors "
> +				"because the target doesn't support extract "
> +				"first reduction.\n");
> +		      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> +		    }
> +		  else
> +		    vect_record_loop_mask (loop_vinfo,
> +					   &LOOP_VINFO_MASKS (loop_vinfo),
> +					   1, vectype, NULL);
> +		}
>  	      else if (can_vec_extract_var_idx_p (
>  			 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
>  		vect_record_loop_len (loop_vinfo,
> 
> 
> 
> 
>
Tamar Christina Jan. 8, 2024, 2:19 p.m. UTC | #2
> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Monday, January 8, 2024 12:48 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jlaw@ventanamicro.com
> Subject: Re: [PATCH]middle-end: check if target can do extract first for early breaks
> [PR113199]
> 
> On Tue, 2 Jan 2024, Tamar Christina wrote:
> 
> > Hi All,
> >
> > I was generating the vector reverse mask without checking if the target
> > actually supported such an operation.
> >
> > It also seems like more targets implement VEC_EXTRACT than permute on mask
> > registers.
> >
> > So this adds a check for IFN_VEC_EXTRACT support when required and changes
> > the select first code to use it.
> >
> > This is good for now since masks always come from whilelo.  But in the future
> > when masks can come from other sources we will need the old code back.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > and no issues with --enable-checking=release --enable-lto
> > --with-build-config=bootstrap-O3 --enable-checking=yes,rtl,extra.
> > tested on cross cc1 for amdgcn-amdhsa and issue fixed.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	PR tree-optimization/113199
> > 	* tree-vect-loop.cc (vectorizable_live_operation_1): Use
> > 	IFN_VEC_EXTRACT.
> > 	(vectorizable_live_operation): Check for IFN_VEC_EXTRACT support.
> >
> > gcc/testsuite/ChangeLog:
> >
> > 	PR tree-optimization/113199
> > 	* gcc.target/gcn/pr113199.c: New test.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/testsuite/gcc.target/gcn/pr113199.c
> b/gcc/testsuite/gcc.target/gcn/pr113199.c
> > new file mode 100644
> > index
> 0000000000000000000000000000000000000000..8a641e5536e80e207ca01
> 63cac66c0f4f6ca93f7
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/gcn/pr113199.c
> > @@ -0,0 +1,44 @@
> > +/* { dg-do compile } */
> > +/* { dg-additional-options "-O2" } */
> > +
> > +typedef long unsigned int size_t;
> > +typedef int wchar_t;
> > +struct tm
> > +{
> > +  int tm_mon;
> > +  int tm_year;
> > +};
> > +int abs (int);
> > +struct lc_time_T { const char *month[12]; };
> > +struct __locale_t * __get_current_locale (void) { }
> > +const struct lc_time_T * __get_time_locale (struct __locale_t *locale) { }
> > +const wchar_t * __ctloc (wchar_t *buf, const char *elem, size_t *len_ret) {
> return buf; }
> > +size_t
> > +__strftime (wchar_t *s, size_t maxsize, const wchar_t *format,
> > +     const struct tm *tim_p, struct __locale_t *locale)
> > +{
> > +  size_t count = 0;
> > +  const wchar_t *ctloc;
> > +  wchar_t ctlocbuf[256];
> > +  size_t i, ctloclen;
> > +  const struct lc_time_T *_CurrentTimeLocale = __get_time_locale (locale);
> > +    {
> > +      switch (*format)
> > + {
> > + case L'B':
> > +   (ctloc = __ctloc (ctlocbuf, _CurrentTimeLocale->month[tim_p->tm_mon],
> &ctloclen));
> > +   for (i = 0; i < ctloclen; i++)
> > +     {
> > +       if (count < maxsize - 1)
> > +  s[count++] = ctloc[i];
> > +       else
> > +  return 0;
> > +       {
> > +  int century = tim_p->tm_year >= 0
> > +    ? tim_p->tm_year / 100 + 1900 / 100
> > +    : abs (tim_p->tm_year + 1900) / 100;
> > +       }
> > +   }
> > + }
> > +    }
> > +}
> > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > index
> 37f1be1101ffae779214056a0886411e0683e887..5aa92e67444e7aacf458fffa14
> 28f1983c482374 100644
> > --- a/gcc/tree-vect-loop.cc
> > +++ b/gcc/tree-vect-loop.cc
> > @@ -10648,36 +10648,18 @@ vectorizable_live_operation_1 (loop_vec_info
> loop_vinfo,
> >  				      &LOOP_VINFO_MASKS (loop_vinfo),
> >  				      1, vectype, 0);
> >        tree scalar_res;
> > +      gimple_seq_add_seq (&stmts, tem);
> >
> >        /* For an inverted control flow with early breaks we want EXTRACT_FIRST
> > -	 instead of EXTRACT_LAST.  Emulate by reversing the vector and mask. */
> > +	 instead of EXTRACT_LAST.  For now since the mask always comes from a
> > +	 WHILELO we can get the first element ignoring the mask since CLZ of the
> > +	 mask will always be zero.  */
> >        if (restart_loop && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> > -	{
> > -	  /* First create the permuted mask.  */
> > -	  tree perm_mask = perm_mask_for_reverse (TREE_TYPE (mask));
> > -	  tree perm_dest = copy_ssa_name (mask);
> > -	  gimple *perm_stmt
> > -		= gimple_build_assign (perm_dest, VEC_PERM_EXPR, mask,
> > -				       mask, perm_mask);
> > -	  vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
> > -				       &gsi);
> > -	  mask = perm_dest;
> > -
> > -	  /* Then permute the vector contents.  */
> > -	  tree perm_elem = perm_mask_for_reverse (vectype);
> > -	  perm_dest = copy_ssa_name (vec_lhs_phi);
> > -	  perm_stmt
> > -		= gimple_build_assign (perm_dest, VEC_PERM_EXPR, vec_lhs_phi,
> > -				       vec_lhs_phi, perm_elem);
> > -	  vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
> > -				       &gsi);
> > -	  vec_lhs_phi = perm_dest;
> > -	}
> > -
> > -      gimple_seq_add_seq (&stmts, tem);
> > -
> > -      scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> > -				 mask, vec_lhs_phi);
> > +	scalar_res = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE
> (vectype),
> > +				   vec_lhs_phi, bitstart);
> 
> So bitstart is always zero?  I wonder why using CFN_VEC_EXTRACT over
> BIT_FIELD_REF here which wouldn't need any additional target support.
> 

Hmm Unless it's recently changed, I believe BIT_FIELD_REF doesn't support
VLA types no?  I guess maybe it does for index 0?  But I was pretty sure it asserts.

> > +      else
> > +	scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> > +				   mask, vec_lhs_phi);
> >
> >        /* Convert the extracted vector element to the scalar type.  */
> >        new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> > @@ -10852,9 +10834,25 @@ vectorizable_live_operation (vec_info *vinfo,
> stmt_vec_info stmt_info,
> >  	      gcc_assert (ncopies == 1 && !slp_node);
> >  	      if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> >  						  OPTIMIZE_FOR_SPEED))
> > -		vect_record_loop_mask (loop_vinfo,
> > -				       &LOOP_VINFO_MASKS (loop_vinfo),
> > -				       1, vectype, NULL);
> > +		{
> > +		  if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)
> > +		      && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> > +		      && !direct_internal_fn_supported_p (IFN_EXTRACT_LAST,
> > +							  vectype,
> > +							  OPTIMIZE_FOR_SPEED))
> 
> I don't quite understand this part - this is guarded by
> 
> direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
>                                                  OPTIMIZE_FOR_SPEED)
> 
> unless I'm mis-matching this means the checked
> !direct_internal_fn_supported_p condition is always false?

Arg,, sorry should be IFN_ VEC_EXTRACT.

I'll fix and see if BIT_FIELD_REF works for 0 index.

Thanks,
Tamar
> 
> > +		    {
> > +		      if (dump_enabled_p ())
> > +			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
> vect_location,
> > +				"can't operate on partial vectors "
> > +				"because the target doesn't support extract "
> > +				"first reduction.\n");
> > +		      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) =
> false;
> > +		    }
> > +		  else
> > +		    vect_record_loop_mask (loop_vinfo,
> > +					   &LOOP_VINFO_MASKS (loop_vinfo),
> > +					   1, vectype, NULL);
> > +		}
> >  	      else if (can_vec_extract_var_idx_p (
> >  			 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE
> (vectype))))
> >  		vect_record_loop_len (loop_vinfo,
> >
> >
> >
> >
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
Tamar Christina Jan. 9, 2024, 11:45 a.m. UTC | #3
> > -
> > -      gimple_seq_add_seq (&stmts, tem);
> > -
> > -      scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> > -				 mask, vec_lhs_phi);
> > +	scalar_res = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE
> (vectype),
> > +				   vec_lhs_phi, bitstart);
> 
> So bitstart is always zero?  I wonder why using CFN_VEC_EXTRACT over
> BIT_FIELD_REF here which wouldn't need any additional target support.
> 

Ok, how about...

---

I was generating the vector reverse mask without checking if the target
actually supported such an operation.

This patch changes it to if the bitstart is 0 then use BIT_FIELD_REF instead
to extract the first element since this is supported by all targets.

This is good for now since masks always come from whilelo.  But in the future
when masks can come from other sources we will need the old code back.

Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
and no issues with --enable-checking=release --enable-lto
--with-build-config=bootstrap-O3 --enable-checking=yes,rtl,extra.
tested on cross cc1 for amdgcn-amdhsa and issue fixed.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	PR tree-optimization/113199
	* tree-vect-loop.cc (vectorizable_live_operation_1): Use
	BIT_FIELD_REF.

gcc/testsuite/ChangeLog:

	PR tree-optimization/113199
	* gcc.target/gcn/pr113199.c: New test.

--- inline copy of patch ---

diff --git a/gcc/testsuite/gcc.target/gcn/pr113199.c b/gcc/testsuite/gcc.target/gcn/pr113199.c
new file mode 100644
index 0000000000000000000000000000000000000000..8a641e5536e80e207ca0163cac66c0f4f6ca93f7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/gcn/pr113199.c
@@ -0,0 +1,44 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2" } */
+
+typedef long unsigned int size_t;
+typedef int wchar_t;
+struct tm
+{
+  int tm_mon;
+  int tm_year;
+};
+int abs (int);
+struct lc_time_T { const char *month[12]; };
+struct __locale_t * __get_current_locale (void) { }
+const struct lc_time_T * __get_time_locale (struct __locale_t *locale) { }
+const wchar_t * __ctloc (wchar_t *buf, const char *elem, size_t *len_ret) { return buf; }
+size_t
+__strftime (wchar_t *s, size_t maxsize, const wchar_t *format,
+     const struct tm *tim_p, struct __locale_t *locale)
+{
+  size_t count = 0;
+  const wchar_t *ctloc;
+  wchar_t ctlocbuf[256];
+  size_t i, ctloclen;
+  const struct lc_time_T *_CurrentTimeLocale = __get_time_locale (locale);
+    {
+      switch (*format)
+ {
+ case L'B':
+   (ctloc = __ctloc (ctlocbuf, _CurrentTimeLocale->month[tim_p->tm_mon], &ctloclen));
+   for (i = 0; i < ctloclen; i++)
+     {
+       if (count < maxsize - 1)
+  s[count++] = ctloc[i];
+       else
+  return 0;
+       {
+  int century = tim_p->tm_year >= 0
+    ? tim_p->tm_year / 100 + 1900 / 100
+    : abs (tim_p->tm_year + 1900) / 100;
+       }
+   }
+ }
+    }
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 37f1be1101ffae779214056a0886411e0683e887..39b1161309d8ff8bfe88ee26df9147df0af0a58c 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10592,7 +10592,17 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
 
   gimple_seq stmts = NULL;
   tree new_tree;
-  if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+
+  /* If bitstart is 0 then we can use a BIT_FIELD_REF  */
+  if (integer_zerop (bitstart))
+    {
+      tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
+				   vec_lhs_phi, bitsize, bitstart);
+
+      /* Convert the extracted vector element to the scalar type.  */
+      new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
+    }
+  else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
     {
       /* Emit:
 
@@ -10618,12 +10628,6 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
       tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
 				     len, bias_minus_one);
 
-      /* This needs to implement extraction of the first index, but not sure
-	 how the LEN stuff works.  At the moment we shouldn't get here since
-	 there's no LEN support for early breaks.  But guard this so there's
-	 no incorrect codegen.  */
-      gcc_assert (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
-
       /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
       tree scalar_res
 	= gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
@@ -10648,32 +10652,6 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
 				      &LOOP_VINFO_MASKS (loop_vinfo),
 				      1, vectype, 0);
       tree scalar_res;
-
-      /* For an inverted control flow with early breaks we want EXTRACT_FIRST
-	 instead of EXTRACT_LAST.  Emulate by reversing the vector and mask. */
-      if (restart_loop && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
-	{
-	  /* First create the permuted mask.  */
-	  tree perm_mask = perm_mask_for_reverse (TREE_TYPE (mask));
-	  tree perm_dest = copy_ssa_name (mask);
-	  gimple *perm_stmt
-		= gimple_build_assign (perm_dest, VEC_PERM_EXPR, mask,
-				       mask, perm_mask);
-	  vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
-				       &gsi);
-	  mask = perm_dest;
-
-	  /* Then permute the vector contents.  */
-	  tree perm_elem = perm_mask_for_reverse (vectype);
-	  perm_dest = copy_ssa_name (vec_lhs_phi);
-	  perm_stmt
-		= gimple_build_assign (perm_dest, VEC_PERM_EXPR, vec_lhs_phi,
-				       vec_lhs_phi, perm_elem);
-	  vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
-				       &gsi);
-	  vec_lhs_phi = perm_dest;
-	}
-
       gimple_seq_add_seq (&stmts, tem);
 
       scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
Richard Biener Jan. 9, 2024, 12:07 p.m. UTC | #4
On Tue, 9 Jan 2024, Tamar Christina wrote:

> > > -
> > > -      gimple_seq_add_seq (&stmts, tem);
> > > -
> > > -      scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> > > -				 mask, vec_lhs_phi);
> > > +	scalar_res = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE
> > (vectype),
> > > +				   vec_lhs_phi, bitstart);
> > 
> > So bitstart is always zero?  I wonder why using CFN_VEC_EXTRACT over
> > BIT_FIELD_REF here which wouldn't need any additional target support.
> > 
> 
> Ok, how about...
> 
> ---
> 
> I was generating the vector reverse mask without checking if the target
> actually supported such an operation.
> 
> This patch changes it to if the bitstart is 0 then use BIT_FIELD_REF instead
> to extract the first element since this is supported by all targets.
> 
> This is good for now since masks always come from whilelo.  But in the future
> when masks can come from other sources we will need the old code back.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> and no issues with --enable-checking=release --enable-lto
> --with-build-config=bootstrap-O3 --enable-checking=yes,rtl,extra.
> tested on cross cc1 for amdgcn-amdhsa and issue fixed.
> 
> Ok for master?

OK.

> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	PR tree-optimization/113199
> 	* tree-vect-loop.cc (vectorizable_live_operation_1): Use
> 	BIT_FIELD_REF.
> 
> gcc/testsuite/ChangeLog:
> 
> 	PR tree-optimization/113199
> 	* gcc.target/gcn/pr113199.c: New test.
> 
> --- inline copy of patch ---
> 
> diff --git a/gcc/testsuite/gcc.target/gcn/pr113199.c b/gcc/testsuite/gcc.target/gcn/pr113199.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..8a641e5536e80e207ca0163cac66c0f4f6ca93f7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/gcn/pr113199.c
> @@ -0,0 +1,44 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O2" } */
> +
> +typedef long unsigned int size_t;
> +typedef int wchar_t;
> +struct tm
> +{
> +  int tm_mon;
> +  int tm_year;
> +};
> +int abs (int);
> +struct lc_time_T { const char *month[12]; };
> +struct __locale_t * __get_current_locale (void) { }
> +const struct lc_time_T * __get_time_locale (struct __locale_t *locale) { }
> +const wchar_t * __ctloc (wchar_t *buf, const char *elem, size_t *len_ret) { return buf; }
> +size_t
> +__strftime (wchar_t *s, size_t maxsize, const wchar_t *format,
> +     const struct tm *tim_p, struct __locale_t *locale)
> +{
> +  size_t count = 0;
> +  const wchar_t *ctloc;
> +  wchar_t ctlocbuf[256];
> +  size_t i, ctloclen;
> +  const struct lc_time_T *_CurrentTimeLocale = __get_time_locale (locale);
> +    {
> +      switch (*format)
> + {
> + case L'B':
> +   (ctloc = __ctloc (ctlocbuf, _CurrentTimeLocale->month[tim_p->tm_mon], &ctloclen));
> +   for (i = 0; i < ctloclen; i++)
> +     {
> +       if (count < maxsize - 1)
> +  s[count++] = ctloc[i];
> +       else
> +  return 0;
> +       {
> +  int century = tim_p->tm_year >= 0
> +    ? tim_p->tm_year / 100 + 1900 / 100
> +    : abs (tim_p->tm_year + 1900) / 100;
> +       }
> +   }
> + }
> +    }
> +}
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 37f1be1101ffae779214056a0886411e0683e887..39b1161309d8ff8bfe88ee26df9147df0af0a58c 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -10592,7 +10592,17 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
>  
>    gimple_seq stmts = NULL;
>    tree new_tree;
> -  if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> +
> +  /* If bitstart is 0 then we can use a BIT_FIELD_REF  */
> +  if (integer_zerop (bitstart))
> +    {
> +      tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
> +				   vec_lhs_phi, bitsize, bitstart);
> +
> +      /* Convert the extracted vector element to the scalar type.  */
> +      new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> +    }
> +  else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
>      {
>        /* Emit:
>  
> @@ -10618,12 +10628,6 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
>        tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
>  				     len, bias_minus_one);
>  
> -      /* This needs to implement extraction of the first index, but not sure
> -	 how the LEN stuff works.  At the moment we shouldn't get here since
> -	 there's no LEN support for early breaks.  But guard this so there's
> -	 no incorrect codegen.  */
> -      gcc_assert (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
> -
>        /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
>        tree scalar_res
>  	= gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
> @@ -10648,32 +10652,6 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
>  				      &LOOP_VINFO_MASKS (loop_vinfo),
>  				      1, vectype, 0);
>        tree scalar_res;
> -
> -      /* For an inverted control flow with early breaks we want EXTRACT_FIRST
> -	 instead of EXTRACT_LAST.  Emulate by reversing the vector and mask. */
> -      if (restart_loop && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> -	{
> -	  /* First create the permuted mask.  */
> -	  tree perm_mask = perm_mask_for_reverse (TREE_TYPE (mask));
> -	  tree perm_dest = copy_ssa_name (mask);
> -	  gimple *perm_stmt
> -		= gimple_build_assign (perm_dest, VEC_PERM_EXPR, mask,
> -				       mask, perm_mask);
> -	  vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
> -				       &gsi);
> -	  mask = perm_dest;
> -
> -	  /* Then permute the vector contents.  */
> -	  tree perm_elem = perm_mask_for_reverse (vectype);
> -	  perm_dest = copy_ssa_name (vec_lhs_phi);
> -	  perm_stmt
> -		= gimple_build_assign (perm_dest, VEC_PERM_EXPR, vec_lhs_phi,
> -				       vec_lhs_phi, perm_elem);
> -	  vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
> -				       &gsi);
> -	  vec_lhs_phi = perm_dest;
> -	}
> -
>        gimple_seq_add_seq (&stmts, tem);
>  
>        scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
>
H.J. Lu Jan. 9, 2024, 4:01 p.m. UTC | #5
On Tue, Jan 9, 2024 at 4:13 AM Richard Biener <rguenther@suse.de> wrote:
>
> On Tue, 9 Jan 2024, Tamar Christina wrote:
>
> > > > -
> > > > -      gimple_seq_add_seq (&stmts, tem);
> > > > -
> > > > -      scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> > > > -                          mask, vec_lhs_phi);
> > > > + scalar_res = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE
> > > (vectype),
> > > > +                            vec_lhs_phi, bitstart);
> > >
> > > So bitstart is always zero?  I wonder why using CFN_VEC_EXTRACT over
> > > BIT_FIELD_REF here which wouldn't need any additional target support.
> > >
> >
> > Ok, how about...
> >
> > ---
> >
> > I was generating the vector reverse mask without checking if the target
> > actually supported such an operation.
> >
> > This patch changes it to if the bitstart is 0 then use BIT_FIELD_REF instead
> > to extract the first element since this is supported by all targets.
> >
> > This is good for now since masks always come from whilelo.  But in the future
> > when masks can come from other sources we will need the old code back.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > and no issues with --enable-checking=release --enable-lto
> > --with-build-config=bootstrap-O3 --enable-checking=yes,rtl,extra.
> > tested on cross cc1 for amdgcn-amdhsa and issue fixed.
> >
> > Ok for master?
>
> OK.
>
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> >       PR tree-optimization/113199
> >       * tree-vect-loop.cc (vectorizable_live_operation_1): Use
> >       BIT_FIELD_REF.
> >
> > gcc/testsuite/ChangeLog:
> >
> >       PR tree-optimization/113199
> >       * gcc.target/gcn/pr113199.c: New test.
> >
> > --- inline copy of patch ---
> >
> > diff --git a/gcc/testsuite/gcc.target/gcn/pr113199.c b/gcc/testsuite/gcc.target/gcn/pr113199.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..8a641e5536e80e207ca0163cac66c0f4f6ca93f7
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/gcn/pr113199.c
> > @@ -0,0 +1,44 @@
> > +/* { dg-do compile } */
> > +/* { dg-additional-options "-O2" } */
> > +
> > +typedef long unsigned int size_t;
> > +typedef int wchar_t;
> > +struct tm
> > +{
> > +  int tm_mon;
> > +  int tm_year;
> > +};
> > +int abs (int);
> > +struct lc_time_T { const char *month[12]; };
> > +struct __locale_t * __get_current_locale (void) { }
> > +const struct lc_time_T * __get_time_locale (struct __locale_t *locale) { }
> > +const wchar_t * __ctloc (wchar_t *buf, const char *elem, size_t *len_ret) { return buf; }
> > +size_t
> > +__strftime (wchar_t *s, size_t maxsize, const wchar_t *format,
> > +     const struct tm *tim_p, struct __locale_t *locale)
> > +{
> > +  size_t count = 0;
> > +  const wchar_t *ctloc;
> > +  wchar_t ctlocbuf[256];
> > +  size_t i, ctloclen;
> > +  const struct lc_time_T *_CurrentTimeLocale = __get_time_locale (locale);
> > +    {
> > +      switch (*format)
> > + {
> > + case L'B':
> > +   (ctloc = __ctloc (ctlocbuf, _CurrentTimeLocale->month[tim_p->tm_mon], &ctloclen));
> > +   for (i = 0; i < ctloclen; i++)
> > +     {
> > +       if (count < maxsize - 1)
> > +  s[count++] = ctloc[i];
> > +       else
> > +  return 0;
> > +       {
> > +  int century = tim_p->tm_year >= 0
> > +    ? tim_p->tm_year / 100 + 1900 / 100
> > +    : abs (tim_p->tm_year + 1900) / 100;
> > +       }
> > +   }
> > + }
> > +    }
> > +}
> > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > index 37f1be1101ffae779214056a0886411e0683e887..39b1161309d8ff8bfe88ee26df9147df0af0a58c 100644
> > --- a/gcc/tree-vect-loop.cc
> > +++ b/gcc/tree-vect-loop.cc
> > @@ -10592,7 +10592,17 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
> >
> >    gimple_seq stmts = NULL;
> >    tree new_tree;
> > -  if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> > +
> > +  /* If bitstart is 0 then we can use a BIT_FIELD_REF  */
> > +  if (integer_zerop (bitstart))
> > +    {
> > +      tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
> > +                                vec_lhs_phi, bitsize, bitstart);
> > +
> > +      /* Convert the extracted vector element to the scalar type.  */
> > +      new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> > +    }
> > +  else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> >      {
> >        /* Emit:
> >
> > @@ -10618,12 +10628,6 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
> >        tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
> >                                    len, bias_minus_one);
> >
> > -      /* This needs to implement extraction of the first index, but not sure
> > -      how the LEN stuff works.  At the moment we shouldn't get here since
> > -      there's no LEN support for early breaks.  But guard this so there's
> > -      no incorrect codegen.  */
> > -      gcc_assert (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
> > -
> >        /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
> >        tree scalar_res
> >       = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
> > @@ -10648,32 +10652,6 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
> >                                     &LOOP_VINFO_MASKS (loop_vinfo),
> >                                     1, vectype, 0);
> >        tree scalar_res;
> > -
> > -      /* For an inverted control flow with early breaks we want EXTRACT_FIRST
> > -      instead of EXTRACT_LAST.  Emulate by reversing the vector and mask. */
> > -      if (restart_loop && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))

This breaks bootstrap:

https://gcc.gnu.org/pipermail/gcc-regression/2024-January/078955.html

../../src-master/gcc/tree-vect-loop.cc: In function ‘tree_node*
vectorizable_live_operation_1(loop_vec_info, stmt_vec_info,
basic_block, tree, int, slp_tree, tree, tree, tree, tree, bool,
gimple_stmt_iterator*)’:
../../src-master/gcc/tree-vect-loop.cc:10598:52: error: unused
parameter ‘restart_loop’ [-Werror=unused-parameter]
10598 |                                tree lhs_type, bool restart_loop,
      |

> > -     {
> > -       /* First create the permuted mask.  */
> > -       tree perm_mask = perm_mask_for_reverse (TREE_TYPE (mask));
> > -       tree perm_dest = copy_ssa_name (mask);
> > -       gimple *perm_stmt
> > -             = gimple_build_assign (perm_dest, VEC_PERM_EXPR, mask,
> > -                                    mask, perm_mask);
> > -       vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
> > -                                    &gsi);
> > -       mask = perm_dest;
> > -
> > -       /* Then permute the vector contents.  */
> > -       tree perm_elem = perm_mask_for_reverse (vectype);
> > -       perm_dest = copy_ssa_name (vec_lhs_phi);
> > -       perm_stmt
> > -             = gimple_build_assign (perm_dest, VEC_PERM_EXPR, vec_lhs_phi,
> > -                                    vec_lhs_phi, perm_elem);
> > -       vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
> > -                                    &gsi);
> > -       vec_lhs_phi = perm_dest;
> > -     }
> > -
> >        gimple_seq_add_seq (&stmts, tem);
> >
> >        scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> >
>
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
Rainer Orth Jan. 9, 2024, 4:06 p.m. UTC | #6
Richard Biener <rguenther@suse.de> writes:

> On Tue, 9 Jan 2024, Tamar Christina wrote:
>
>> > > -
>> > > -      gimple_seq_add_seq (&stmts, tem);
>> > > -
>> > > -      scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
>> > > -				 mask, vec_lhs_phi);
>> > > +	scalar_res = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE
>> > (vectype),
>> > > +				   vec_lhs_phi, bitstart);
>> > 
>> > So bitstart is always zero?  I wonder why using CFN_VEC_EXTRACT over
>> > BIT_FIELD_REF here which wouldn't need any additional target support.
>> > 
>> 
>> Ok, how about...
>> 
>> ---
>> 
>> I was generating the vector reverse mask without checking if the target
>> actually supported such an operation.
>> 
>> This patch changes it to if the bitstart is 0 then use BIT_FIELD_REF instead
>> to extract the first element since this is supported by all targets.
>> 
>> This is good for now since masks always come from whilelo.  But in the future
>> when masks can come from other sources we will need the old code back.
>> 
>> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
>> and no issues with --enable-checking=release --enable-lto
>> --with-build-config=bootstrap-O3 --enable-checking=yes,rtl,extra.
>> tested on cross cc1 for amdgcn-amdhsa and issue fixed.
>> 
>> Ok for master?
>
> OK.
>
>> Thanks,
>> Tamar
>> 
>> gcc/ChangeLog:
>> 
>> 	PR tree-optimization/113199
>> 	* tree-vect-loop.cc (vectorizable_live_operation_1): Use
>> 	BIT_FIELD_REF.

This patch broke bootstrap (everywhere, it seems; seen on
i386-pc-solaris2.11 and sparc-sun-solaris2.11):

/vol/gcc/src/hg/master/local/gcc/tree-vect-loop.cc: In function 'tree_node* vectorizable_live_operation_1(loop_vec_info, stmt_vec_info, basic_block, tree, int, slp_tree, tree, tree, tree, tree, bool, gimple_stmt_iterator*)':
/vol/gcc/src/hg/master/local/gcc/tree-vect-loop.cc:10598:52: error: unused parameter 'restart_loop' [-Werror=unused-parameter]
10598 |                                tree lhs_type, bool restart_loop,
      |                                               ~~~~~^~~~~~~~~~~~

	Rainer
Tamar Christina Jan. 9, 2024, 4:09 p.m. UTC | #7
Hmm I'm confused as to why It didn't break mine.. just did one again.. anyway I'll remove the unused variable.

> -----Original Message-----
> From: Rainer Orth <ro@CeBiTec.Uni-Bielefeld.DE>
> Sent: Tuesday, January 9, 2024 4:06 PM
> To: Richard Biener <rguenther@suse.de>
> Cc: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org; nd
> <nd@arm.com>; jlaw@ventanamicro.com
> Subject: Re: [PATCH]middle-end: check if target can do extract first for early breaks
> [PR113199]
> 
> Richard Biener <rguenther@suse.de> writes:
> 
> > On Tue, 9 Jan 2024, Tamar Christina wrote:
> >
> >> > > -
> >> > > -      gimple_seq_add_seq (&stmts, tem);
> >> > > -
> >> > > -      scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> >> > > -				 mask, vec_lhs_phi);
> >> > > +	scalar_res = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE
> >> > (vectype),
> >> > > +				   vec_lhs_phi, bitstart);
> >> >
> >> > So bitstart is always zero?  I wonder why using CFN_VEC_EXTRACT over
> >> > BIT_FIELD_REF here which wouldn't need any additional target support.
> >> >
> >>
> >> Ok, how about...
> >>
> >> ---
> >>
> >> I was generating the vector reverse mask without checking if the target
> >> actually supported such an operation.
> >>
> >> This patch changes it to if the bitstart is 0 then use BIT_FIELD_REF instead
> >> to extract the first element since this is supported by all targets.
> >>
> >> This is good for now since masks always come from whilelo.  But in the future
> >> when masks can come from other sources we will need the old code back.
> >>
> >> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> >> and no issues with --enable-checking=release --enable-lto
> >> --with-build-config=bootstrap-O3 --enable-checking=yes,rtl,extra.
> >> tested on cross cc1 for amdgcn-amdhsa and issue fixed.
> >>
> >> Ok for master?
> >
> > OK.
> >
> >> Thanks,
> >> Tamar
> >>
> >> gcc/ChangeLog:
> >>
> >> 	PR tree-optimization/113199
> >> 	* tree-vect-loop.cc (vectorizable_live_operation_1): Use
> >> 	BIT_FIELD_REF.
> 
> This patch broke bootstrap (everywhere, it seems; seen on
> i386-pc-solaris2.11 and sparc-sun-solaris2.11):
> 
> /vol/gcc/src/hg/master/local/gcc/tree-vect-loop.cc: In function 'tree_node*
> vectorizable_live_operation_1(loop_vec_info, stmt_vec_info, basic_block, tree, int,
> slp_tree, tree, tree, tree, tree, bool, gimple_stmt_iterator*)':
> /vol/gcc/src/hg/master/local/gcc/tree-vect-loop.cc:10598:52: error: unused
> parameter 'restart_loop' [-Werror=unused-parameter]
> 10598 |                                tree lhs_type, bool restart_loop,
>       |                                               ~~~~~^~~~~~~~~~~~
> 
> 	Rainer
> 
> --
> -----------------------------------------------------------------------------
> Rainer Orth, Center for Biotechnology, Bielefeld University
H.J. Lu Jan. 9, 2024, 4:12 p.m. UTC | #8
On Tue, Jan 9, 2024 at 8:10 AM Tamar Christina <Tamar.Christina@arm.com> wrote:
>
> Hmm I'm confused as to why It didn't break mine.. just did one again.. anyway I'll remove the unused variable.

Can you also make vectorizable_live_operation_1 static?

> > -----Original Message-----
> > From: Rainer Orth <ro@CeBiTec.Uni-Bielefeld.DE>
> > Sent: Tuesday, January 9, 2024 4:06 PM
> > To: Richard Biener <rguenther@suse.de>
> > Cc: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org; nd
> > <nd@arm.com>; jlaw@ventanamicro.com
> > Subject: Re: [PATCH]middle-end: check if target can do extract first for early breaks
> > [PR113199]
> >
> > Richard Biener <rguenther@suse.de> writes:
> >
> > > On Tue, 9 Jan 2024, Tamar Christina wrote:
> > >
> > >> > > -
> > >> > > -      gimple_seq_add_seq (&stmts, tem);
> > >> > > -
> > >> > > -      scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> > >> > > -                               mask, vec_lhs_phi);
> > >> > > +      scalar_res = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE
> > >> > (vectype),
> > >> > > +                                 vec_lhs_phi, bitstart);
> > >> >
> > >> > So bitstart is always zero?  I wonder why using CFN_VEC_EXTRACT over
> > >> > BIT_FIELD_REF here which wouldn't need any additional target support.
> > >> >
> > >>
> > >> Ok, how about...
> > >>
> > >> ---
> > >>
> > >> I was generating the vector reverse mask without checking if the target
> > >> actually supported such an operation.
> > >>
> > >> This patch changes it to if the bitstart is 0 then use BIT_FIELD_REF instead
> > >> to extract the first element since this is supported by all targets.
> > >>
> > >> This is good for now since masks always come from whilelo.  But in the future
> > >> when masks can come from other sources we will need the old code back.
> > >>
> > >> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > >> and no issues with --enable-checking=release --enable-lto
> > >> --with-build-config=bootstrap-O3 --enable-checking=yes,rtl,extra.
> > >> tested on cross cc1 for amdgcn-amdhsa and issue fixed.
> > >>
> > >> Ok for master?
> > >
> > > OK.
> > >
> > >> Thanks,
> > >> Tamar
> > >>
> > >> gcc/ChangeLog:
> > >>
> > >>    PR tree-optimization/113199
> > >>    * tree-vect-loop.cc (vectorizable_live_operation_1): Use
> > >>    BIT_FIELD_REF.
> >
> > This patch broke bootstrap (everywhere, it seems; seen on
> > i386-pc-solaris2.11 and sparc-sun-solaris2.11):
> >
> > /vol/gcc/src/hg/master/local/gcc/tree-vect-loop.cc: In function 'tree_node*
> > vectorizable_live_operation_1(loop_vec_info, stmt_vec_info, basic_block, tree, int,
> > slp_tree, tree, tree, tree, tree, bool, gimple_stmt_iterator*)':
> > /vol/gcc/src/hg/master/local/gcc/tree-vect-loop.cc:10598:52: error: unused
> > parameter 'restart_loop' [-Werror=unused-parameter]
> > 10598 |                                tree lhs_type, bool restart_loop,
> >       |                                               ~~~~~^~~~~~~~~~~~
> >
> >       Rainer
> >
> > --
> > -----------------------------------------------------------------------------
> > Rainer Orth, Center for Biotechnology, Bielefeld University
diff mbox series

Patch

--- /dev/null
+++ b/gcc/testsuite/gcc.target/gcn/pr113199.c
@@ -0,0 +1,44 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-O2" } */
+
+typedef long unsigned int size_t;
+typedef int wchar_t;
+struct tm
+{
+  int tm_mon;
+  int tm_year;
+};
+int abs (int);
+struct lc_time_T { const char *month[12]; };
+struct __locale_t * __get_current_locale (void) { }
+const struct lc_time_T * __get_time_locale (struct __locale_t *locale) { }
+const wchar_t * __ctloc (wchar_t *buf, const char *elem, size_t *len_ret) { return buf; }
+size_t
+__strftime (wchar_t *s, size_t maxsize, const wchar_t *format,
+     const struct tm *tim_p, struct __locale_t *locale)
+{
+  size_t count = 0;
+  const wchar_t *ctloc;
+  wchar_t ctlocbuf[256];
+  size_t i, ctloclen;
+  const struct lc_time_T *_CurrentTimeLocale = __get_time_locale (locale);
+    {
+      switch (*format)
+ {
+ case L'B':
+   (ctloc = __ctloc (ctlocbuf, _CurrentTimeLocale->month[tim_p->tm_mon], &ctloclen));
+   for (i = 0; i < ctloclen; i++)
+     {
+       if (count < maxsize - 1)
+  s[count++] = ctloc[i];
+       else
+  return 0;
+       {
+  int century = tim_p->tm_year >= 0
+    ? tim_p->tm_year / 100 + 1900 / 100
+    : abs (tim_p->tm_year + 1900) / 100;
+       }
+   }
+ }
+    }
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 37f1be1101ffae779214056a0886411e0683e887..5aa92e67444e7aacf458fffa1428f1983c482374 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10648,36 +10648,18 @@  vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
 				      &LOOP_VINFO_MASKS (loop_vinfo),
 				      1, vectype, 0);
       tree scalar_res;
+      gimple_seq_add_seq (&stmts, tem);
 
       /* For an inverted control flow with early breaks we want EXTRACT_FIRST
-	 instead of EXTRACT_LAST.  Emulate by reversing the vector and mask. */
+	 instead of EXTRACT_LAST.  For now since the mask always comes from a
+	 WHILELO we can get the first element ignoring the mask since CLZ of the
+	 mask will always be zero.  */
       if (restart_loop && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
-	{
-	  /* First create the permuted mask.  */
-	  tree perm_mask = perm_mask_for_reverse (TREE_TYPE (mask));
-	  tree perm_dest = copy_ssa_name (mask);
-	  gimple *perm_stmt
-		= gimple_build_assign (perm_dest, VEC_PERM_EXPR, mask,
-				       mask, perm_mask);
-	  vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
-				       &gsi);
-	  mask = perm_dest;
-
-	  /* Then permute the vector contents.  */
-	  tree perm_elem = perm_mask_for_reverse (vectype);
-	  perm_dest = copy_ssa_name (vec_lhs_phi);
-	  perm_stmt
-		= gimple_build_assign (perm_dest, VEC_PERM_EXPR, vec_lhs_phi,
-				       vec_lhs_phi, perm_elem);
-	  vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
-				       &gsi);
-	  vec_lhs_phi = perm_dest;
-	}
-
-      gimple_seq_add_seq (&stmts, tem);
-
-      scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
-				 mask, vec_lhs_phi);
+	scalar_res = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
+				   vec_lhs_phi, bitstart);
+      else
+	scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
+				   mask, vec_lhs_phi);
 
       /* Convert the extracted vector element to the scalar type.  */
       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
@@ -10852,9 +10834,25 @@  vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
 	      gcc_assert (ncopies == 1 && !slp_node);
 	      if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
 						  OPTIMIZE_FOR_SPEED))
-		vect_record_loop_mask (loop_vinfo,
-				       &LOOP_VINFO_MASKS (loop_vinfo),
-				       1, vectype, NULL);
+		{
+		  if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)
+		      && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
+		      && !direct_internal_fn_supported_p (IFN_EXTRACT_LAST,
+							  vectype,
+							  OPTIMIZE_FOR_SPEED))
+		    {
+		      if (dump_enabled_p ())
+			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				"can't operate on partial vectors "
+				"because the target doesn't support extract "
+				"first reduction.\n");
+		      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+		    }
+		  else
+		    vect_record_loop_mask (loop_vinfo,
+					   &LOOP_VINFO_MASKS (loop_vinfo),
+					   1, vectype, NULL);
+		}
 	      else if (can_vec_extract_var_idx_p (
 			 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
 		vect_record_loop_len (loop_vinfo,