diff mbox

[vec-tails] Support loop epilogue vectorization

Message ID alpine.LSU.2.11.1612011110070.5294@t29.fhfr.qr
State New
Headers show

Commit Message

Richard Biener Dec. 1, 2016, 11:33 a.m. UTC
On Mon, 28 Nov 2016, Yuri Rumyantsev wrote:

> Richard!
> 
> I attached vect dump for hte part of attached test-case which
> illustrated how vectorization of epilogues works through masking:
> #define SIZE 1023
> #define ALIGN 64
> 
> extern int posix_memalign(void **memptr, __SIZE_TYPE__ alignment,
> __SIZE_TYPE__ size) __attribute__((weak));
> extern void free (void *);
> 
> void __attribute__((noinline))
> test_citer (int * __restrict__ a,
>    int * __restrict__ b,
>    int * __restrict__ c)
> {
>   int i;
> 
>   a = (int *)__builtin_assume_aligned (a, ALIGN);
>   b = (int *)__builtin_assume_aligned (b, ALIGN);
>   c = (int *)__builtin_assume_aligned (c, ALIGN);
> 
>   for (i = 0; i < SIZE; i++)
>     c[i] = a[i] + b[i];
> }
> 
> It was compiled with -mavx2 --param vect-epilogues-mask=1 options.
> 
> I did not include in this patch vectorization of low trip-count loops
> since in the original patch additional parameter was introduced:
> +DEFPARAM (PARAM_VECT_SHORT_LOOPS,
> +  "vect-short-loops",
> +  "Enable vectorization of low trip count loops using masking.",
> +  0, 0, 1)
> 
> I assume that this ability can be included very quickly but it
> requires cost model enhancements also.

Comments on the patch itself (as I'm having a closer look again,
I know how it vectorizes the above but I wondered why epilogue
and short-trip loops are not basically the same code path).

Btw, I don't like that the features are behind a --param paywall.
That just means a) nobody will use it, b) it will bit-rot quickly,
c) bugs are well-hidden.

+  if (loop_vinfo && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
+      && integer_zerop (nested_in_vect_loop
+                       ? STMT_VINFO_DR_STEP (stmt_info)
+                       : DR_STEP (dr)))
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "allow invariant load for masked loop.\n");
+    }

this can test memory_access_type == VMAT_INVARIANT.  Please put
all the checks in a common

  if (loop_vinfo && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
    {
       if (memory_access_type == VMAT_INVARIANT)
         {
         }
       else if (...)
         {
            LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = false;
         }
       else if (..)
...
    }

@@ -6667,6 +6756,15 @@ vectorizable_load (gimple *stmt, 
gimple_stmt_iterator *gsi, gimple **vec_stmt,
       gcc_assert (!nested_in_vect_loop);
       gcc_assert (!STMT_VINFO_GATHER_SCATTER_P (stmt_info));

+      if (loop_vinfo && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "cannot be masked: grouped access is not"
+                            " supported.");
+         LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = false;
+      }
+

isn't this already handled by the above?  Or rather the general
disallowance of SLP?

@@ -5730,6 +5792,24 @@ vectorizable_store (gimple *stmt, 
gimple_stmt_iterator *gsi, gimple **vec_stmt,
                            &memory_access_type, &gs_info))
     return false;

+  if (loop_vinfo && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
+      && memory_access_type != VMAT_CONTIGUOUS)
+    {
+      LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = false;
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "cannot be masked: unsupported memory access 
type.\n");
+    }
+
+  if (loop_vinfo && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
+      && !can_mask_load_store (stmt))
+    {
+      LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = false;
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "cannot be masked: unsupported masked store.\n");
+    }
+

likewise please combine the ifs.

@@ -2354,7 +2401,10 @@ vectorizable_mask_load_store (gimple *stmt, 
gimple_stmt_iterator *gsi,
                                          ptr, vec_mask, vec_rhs);
          vect_finish_stmt_generation (stmt, new_stmt, gsi);
          if (i == 0)
-           STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
+           {
+             STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
+             STMT_VINFO_FIRST_COPY_P (vinfo_for_stmt (new_stmt)) = true;
+           }
          else
            STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
          prev_stmt_info = vinfo_for_stmt (new_stmt);

here you only set the flag, elsewhere you copy DR and VECTYPE as well.

@@ -2113,6 +2146,20 @@ vectorizable_mask_load_store (gimple *stmt, 
gimple_stmt_iterator *gsi,
               && !useless_type_conversion_p (vectype, rhs_vectype)))
     return false;

+  if (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
+    {
+      /* Check that mask conjuction is supported.  */
+      optab tab;
+      tab = optab_for_tree_code (BIT_AND_EXPR, vectype, optab_default);
+      if (!tab || optab_handler (tab, TYPE_MODE (vectype)) == 
CODE_FOR_nothing)
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "cannot be masked: unsupported mask 
operation\n");
+         LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = false;
+       }
+    }

does this really test whether we can bit-and the mask?  You are
using the vector type of the store (which might be V2DF for example),
also for AVX512 it might be a vector-bool type with integer mode?
Of course we maybe can simply assume mask conjunction is available
(I know no ISA where that would be not true).

+/* Return true if STMT can be converted to masked form.  */
+
+static bool
+can_mask_load_store (gimple *stmt)
+{
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  tree vectype, mask_vectype;
+  tree lhs, ref;
+
+  if (!stmt_info)
+    return false;
+  lhs = gimple_assign_lhs (stmt);
+  ref = (TREE_CODE (lhs) == SSA_NAME) ? gimple_assign_rhs1 (stmt) : lhs;
+  if (may_be_nonaddressable_p (ref))
+    return false;
+  vectype = STMT_VINFO_VECTYPE (stmt_info);

You probably modeled this after ifcvt_can_use_mask_load_store but I
don't think checking may_be_nonaddressable_p is necessary (we couldn't
even vectorize such refs).  stmt_info should never be NULL either.
With the check removed tree-ssa-loop-ivopts.h should no longer be
necessary.

+static void
+vect_mask_load_store_stmt (gimple *stmt, tree vectype, tree mask,
+                          data_reference *dr, gimple_stmt_iterator *si)
+{
...
+  addr = force_gimple_operand_gsi (&gsi, build_fold_addr_expr (mem),
+                                  true, NULL_TREE, true,
+                                  GSI_SAME_STMT);
+
+  align = TYPE_ALIGN_UNIT (vectype);
+  if (aligned_access_p (dr))
+    misalign = 0;
+  else if (DR_MISALIGNMENT (dr) == -1)
+    {
+      align = TYPE_ALIGN_UNIT (elem_type);
+      misalign = 0;
+    }
+  else
+    misalign = DR_MISALIGNMENT (dr);
+  set_ptr_info_alignment (get_ptr_info (addr), align, misalign);
+  ptr = build_int_cst (reference_alias_ptr_type (mem),
+                      misalign ? misalign & -misalign : align);

you should simply use

  align = get_object_alignment (mem) / BITS_PER_UNIT;

here rather than trying to be clever.  Eventually you don't need
the DR then (see question above).

+    }
+  gsi_replace (si ? si : &gsi, new_stmt, false);

when you replace the load/store please previously copy VUSE and VDEF
from the original one (we were nearly clean enough to no longer
require a virtual operand rewrite after vectorization...)  Thus

  gimple_set_vuse (new_stmt, gimple_vuse (stmt));
  gimple_set_vdef (new_stmt, gimple_vdef (stmt));

+static void
+vect_mask_loop (loop_vec_info loop_vinfo)
+{
...
+  /* Scan all loop statements to convert vector load/store including 
masked
+     form.  */
+  for (unsigned i = 0; i < loop->num_nodes; i++)
+    {
+      basic_block bb = bbs[i];
+      for (gimple_stmt_iterator si = gsi_start_bb (bb);
+          !gsi_end_p (si); gsi_next (&si))
+       {
+         gimple *stmt = gsi_stmt (si);
+         stmt_vec_info stmt_info = NULL;
+         tree vectype = NULL;
+         data_reference *dr;
+
+         /* Mask load case.  */
+         if (is_gimple_call (stmt)
+             && gimple_call_internal_p (stmt)
+             && gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
+             && !VECTOR_TYPE_P (TREE_TYPE (gimple_call_arg (stmt, 2))))
+           {
...
+             /* Skip invariant loads.  */
+             if (integer_zerop (nested_in_vect_loop_p (loop, stmt)
+                                ? STMT_VINFO_DR_STEP (stmt_info)
+                                : DR_STEP (STMT_VINFO_DATA_REF 
(stmt_info))))
+               continue;

seeing this it would be nice if stmt_info had a flag for whether
the stmt needs masking (and a flag on wheter this is a scalar or a
vectorized stmt).

+         /* Skip hoisted out statements.  */
+         if (!flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
+           continue;

err, you walk stmts in the loop!  Isn't this covered by the above
skipping of 'invariant loads'?

+static gimple *
+vect_mask_reduction_stmt (gimple *stmt, tree mask, gimple *prev)
+{

depending on the reduction operand there are variants that
could get away w/o the VEC_COND_EXPR, like

  S1': tem_4 = d_3 & MASK;
  S2': r_1 = r_2 + tem_4;

which works for plus at least.  More generally doing

  S1': tem_4 = VEC_COND_EXPR<MASK, d_3, neutral operand>
  S2': r_1 = r_2 OP tem_4;

and leaving optimization to & to later opts (& won't work for
AVX512 mask registers I guess).

Good enough for later enhacement of course.

+static void
+vect_gen_ivs_for_masking (loop_vec_info loop_vinfo, vec<tree> *ivs)
+{
...

isn't it enough to always create a single IV and derive the
additional copies by IV + i * { elems, elems, elems ... }?
IVs are expensive -- I'm sure we can optimize the rest of the
scheme further as well but this one looks obvious to me.

@@ -3225,12 +3508,32 @@ vect_estimate_min_profitable_iters (loop_vec_info 
loop_vinfo,
   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);

+  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
+    {
+      /* Currently we don't produce scalar epilogue version in case
+        its masked version is provided.  It means we don't need to
+        compute profitability one more time here.  Just make a
+        masked loop version.  */
+      if (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
+         && PARAM_VALUE (PARAM_VECT_EPILOGUES_MASK))
+       {
+         dump_printf_loc (MSG_NOTE, vect_location,
+                          "cost model: mask loop epilogue.\n");
+         LOOP_VINFO_MASK_LOOP (loop_vinfo) = true;
+         *ret_min_profitable_niters = 0;
+         *ret_min_profitable_estimate = 0;
+         return;
+       }
+    }
   /* Cost model disabled.  */
-  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
+  else if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
     {
       dump_printf_loc (MSG_NOTE, vect_location, "cost model 
disabled.\n");
       *ret_min_profitable_niters = 0;
       *ret_min_profitable_estimate = 0;
+      if (PARAM_VALUE (PARAM_VECT_EPILOGUES_MASK)
+         && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
+       LOOP_VINFO_MASK_LOOP (loop_vinfo) = true;
       return;
     }

the unlimited_cost_model case should come first?  OTOH masking or
not is probably not sth covered by 'unlimited' - that is about
vectorizing or not.  But the above code means that for
epilogue vectorization w/o masking we ignore unlimited_cost_model ()?
That doesn't make sense to me.

Plus if this is short-trip or epilogue vectorization and the
cost model is _not_ unlimited then we dont' want to enable
masking always (if it is possible).  It might be we statically
know the epilogue executes for at most two iterations for example.

I don't see _any_ cost model for vectorizing the epilogue with
masking?  Am I missing something?  A "trivial" cost model
should at least consider the additional IV(s), the mask
compute and the widening and narrowing ops required.


I think the prolog_peeling was fixed during the epilogue vectorization 
review and should no longer be necessary.  Please add
a && ! LOOP_VINFO_MASK_LOOP () to the epilog_peeling init instead
(it should also work for short-trip loop vectorization).

@@ -2022,11 +2291,18 @@ start_over:
       || (max_niter != -1
          && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
     {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "not vectorized: iteration count smaller than "
-                        "vectorization factor.\n");
-      return false;
+      /* Allow low trip count for loop epilogue we want to mask.  */
+      if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+         && PARAM_VALUE (PARAM_VECT_EPILOGUES_MASK))
+       LOOP_VINFO_MASK_LOOP (loop_vinfo) = true;
+      else
+       {
+         if (dump_enabled_p ())

so why do we test only LOOP_VINFO_EPILOGUE_P here?  All the code
I saw sofar would also work for the main loop (but the cost
model is missing).

I am missing testcases.  There's only a single one but we should
have cases covering all kinds of mask IV widths and widen/shorten
masks.

Do you have any numbers on SPEC 2k6 with epilogue vect and/or masking
enabled for an AVX2 machine?

Oh, and I really dislike the --param paywall.

Thanks,
Richard.

> Best regards.
> Yuri.
> 
> 
> 2016-11-28 17:39 GMT+03:00 Richard Biener <rguenther@suse.de>:
> > On Thu, 24 Nov 2016, Yuri Rumyantsev wrote:
> >
> >> Hi All,
> >>
> >> Here is the second patch which supports epilogue vectorization using
> >> masking without cost model. Currently it is possible
> >> only with passing parameter "--param vect-epilogues-mask=1".
> >>
> >> Bootstrapping and regression testing did not show any new regression.
> >>
> >> Any comments will be appreciated.
> >
> > Going over the patch the main question is one how it works -- it looks
> > like the decision whether to vectorize & mask the epilogue is made
> > when vectorizing the loop that generates the epilogue rather than
> > in the epilogue vectorization path?
> >
> > That is, I'd have expected to see this handling low-trip count loops
> > by masking?  And thus masking the epilogue simply by it being
> > low-trip count?
> >
> > Richard.
> >
> >> ChangeLog:
> >> 2016-11-24  Yuri Rumyantsev  <ysrumyan@gmail.com>
> >>
> >> * params.def (PARAM_VECT_EPILOGUES_MASK): New.
> >> * tree-vect-data-refs.c (vect_get_new_ssa_name): Support vect_mask_var.
> >> * tree-vect-loop.c: Include insn-config.h, recog.h and alias.h.
> >> (new_loop_vec_info): Add zeroing can_be_masked, mask_loop and
> >> required_mask fields.
> >> (vect_check_required_masks_widening): New.
> >> (vect_check_required_masks_narrowing): New.
> >> (vect_get_masking_iv_elems): New.
> >> (vect_get_masking_iv_type): New.
> >> (vect_get_extreme_masks): New.
> >> (vect_check_required_masks): New.
> >> (vect_analyze_loop_operations): Call vect_check_required_masks if all
> >> statements can be masked.
> >> (vect_analyze_loop_2): Inititalize to zero min_scalar_loop_bound.
> >> Add check that epilogue can be masked with the same vf with issue
> >> fail notes.  Allow epilogue vectorization through masking of low trip
> >> loops. Set to true can_be_masked field before loop operation analysis.
> >> Do not set-up min_scalar_loop_bound for epilogue vectorization through
> >> masking.  Do not peeling for epilogue masking.  Reset can_be_masked
> >> field before repeat analysis.
> >> (vect_estimate_min_profitable_iters): Do not compute profitability
> >> for epilogue masking.  Set up mask_loop filed to true if parameter
> >> PARAM_VECT_EPILOGUES_MASK is non-zero.
> >> (vectorizable_reduction): Add check that statement can be masked.
> >> (vectorizable_induction): Do not support masking for induction.
> >> (vect_gen_ivs_for_masking): New.
> >> (vect_get_mask_index_for_elems): New.
> >> (vect_get_mask_index_for_type): New.
> >> (vect_create_narrowed_masks): New.
> >> (vect_create_widened_masks): New.
> >> (vect_gen_loop_masks): New.
> >> (vect_mask_reduction_stmt): New.
> >> (vect_mask_mask_load_store_stmt): New.
> >> (vect_mask_load_store_stmt): New.
> >> (vect_mask_loop): New.
> >> (vect_transform_loop): Invoke vect_mask_loop if required.
> >> Use div_ceil to recompute upper bounds for masked loops.  Issue
> >> statistics for epilogue vectorization through masking. Do not reduce
> >> vf for masking epilogue.
> >> * tree-vect-stmts.c: Include tree-ssa-loop-ivopts.h.
> >> (can_mask_load_store): New.
> >> (vectorizable_mask_load_store): Check that mask conjuction is
> >> supported.  Set-up first_copy_p field of stmt_vinfo.
> >> (vectorizable_simd_clone_call): Check that simd clone can not be
> >> masked.
> >> (vectorizable_store): Check that store can be masked. Mark the first
> >> copy of generated vector stores and provide it with vectype and the
> >> original data reference.
> >> (vectorizable_load): Check that load can be masked.
> >> (vect_stmt_should_be_masked_for_epilogue): New.
> >> (vect_add_required_mask_for_stmt): New.
> >> (vect_analyze_stmt): Add check on unsupported statements for masking
> >> with printing message.
> >> * tree-vectorizer.h (struct _loop_vec_info): Add new fields
> >> can_be_maske, required_masks, masl_loop.
> >> (LOOP_VINFO_CAN_BE_MASKED): New.
> >> (LOOP_VINFO_REQUIRED_MASKS): New.
> >> (LOOP_VINFO_MASK_LOOP): New.
> >> (struct _stmt_vec_info): Add first_copy_p field.
> >> (STMT_VINFO_FIRST_COPY_P): New.
> >>
> >> gcc/testsuite/
> >>
> >> * gcc.dg/vect/vect-tail-mask-1.c: New test.
> >>
> >> 2016-11-18 18:54 GMT+03:00 Christophe Lyon <christophe.lyon@linaro.org>:
> >> > On 18 November 2016 at 16:46, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
> >> >> It is very strange that this test failed on arm, since it requires
> >> >> target avx2 to check vectorizer dumps:
> >> >>
> >> >> /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" {
> >> >> target avx2_runtime } } } */
> >> >> /* { dg-final { scan-tree-dump-times "LOOP EPILOGUE VECTORIZED
> >> >> \\(VS=16\\)" 2 "vect" { target avx2_runtime } } } */
> >> >>
> >> >> Could you please clarify what is the reason of the failure?
> >> >
> >> > It's not the scan-dumps that fail, but the execution.
> >> > The test calls abort() for some reason.
> >> >
> >> > It will take me a while to rebuild the test manually in the right
> >> > debug environment to provide you with more traces.
> >> >
> >> >
> >> >
> >> >>
> >> >> Thanks.
> >> >>
> >> >> 2016-11-18 16:20 GMT+03:00 Christophe Lyon <christophe.lyon@linaro.org>:
> >> >>> On 15 November 2016 at 15:41, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
> >> >>>> Hi All,
> >> >>>>
> >> >>>> Here is patch for non-masked epilogue vectoriziation.
> >> >>>>
> >> >>>> Bootstrap and regression testing did not show any new failures.
> >> >>>>
> >> >>>> Is it OK for trunk?
> >> >>>>
> >> >>>> Thanks.
> >> >>>> Changelog:
> >> >>>>
> >> >>>> 2016-11-15  Yuri Rumyantsev  <ysrumyan@gmail.com>
> >> >>>>
> >> >>>> * params.def (PARAM_VECT_EPILOGUES_NOMASK): New.
> >> >>>> * tree-if-conv.c (tree_if_conversion): Make public.
> >> >>>> * * tree-if-conv.h: New file.
> >> >>>> * tree-vect-data-refs.c (vect_analyze_data_ref_dependences) Avoid
> >> >>>> dynamic alias checks for epilogues.
> >> >>>> * tree-vect-loop-manip.c (vect_do_peeling): Return created epilog.
> >> >>>> * tree-vect-loop.c: include tree-if-conv.h.
> >> >>>> (new_loop_vec_info): Add zeroing orig_loop_info field.
> >> >>>> (vect_analyze_loop_2): Don't try to enhance alignment for epilogues.
> >> >>>> (vect_analyze_loop): Add argument ORIG_LOOP_INFO which is not NULL
> >> >>>> if epilogue is vectorized, set up orig_loop_info field of loop_vinfo
> >> >>>> using passed argument.
> >> >>>> (vect_transform_loop): Check if created epilogue should be returned
> >> >>>> for further vectorization with less vf.  If-convert epilogue if
> >> >>>> required. Print vectorization success for epilogue.
> >> >>>> * tree-vectorizer.c (vectorize_loops): Add epilogue vectorization
> >> >>>> if it is required, pass loop_vinfo produced during vectorization of
> >> >>>> loop body to vect_analyze_loop.
> >> >>>> * tree-vectorizer.h (struct _loop_vec_info): Add new field
> >> >>>> orig_loop_info.
> >> >>>> (LOOP_VINFO_ORIG_LOOP_INFO): New.
> >> >>>> (LOOP_VINFO_EPILOGUE_P): New.
> >> >>>> (LOOP_VINFO_ORIG_VECT_FACTOR): New.
> >> >>>> (vect_do_peeling): Change prototype to return epilogue.
> >> >>>> (vect_analyze_loop): Add argument of loop_vec_info type.
> >> >>>> (vect_transform_loop): Return created loop.
> >> >>>>
> >> >>>> gcc/testsuite/
> >> >>>>
> >> >>>> * lib/target-supports.exp (check_avx2_hw_available): New.
> >> >>>> (check_effective_target_avx2_runtime): New.
> >> >>>> * gcc.dg/vect/vect-tail-nomask-1.c: New test.
> >> >>>>
> >> >>>
> >> >>> Hi,
> >> >>>
> >> >>> This new test fails on arm-none-eabi (using default cpu/fpu/mode):
> >> >>>   gcc.dg/vect/vect-tail-nomask-1.c -flto -ffat-lto-objects execution test
> >> >>>   gcc.dg/vect/vect-tail-nomask-1.c execution test
> >> >>>
> >> >>> It does pass on the same target if configured --with-cpu=cortex-a9.
> >> >>>
> >> >>> Christophe
> >> >>>
> >> >>>
> >> >>>
> >> >>>>
> >> >>>> 2016-11-14 20:04 GMT+03:00 Richard Biener <rguenther@suse.de>:
> >> >>>>> On November 14, 2016 4:39:40 PM GMT+01:00, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
> >> >>>>>>Richard,
> >> >>>>>>
> >> >>>>>>I checked one of the tests designed for epilogue vectorization using
> >> >>>>>>patches 1 - 3 and found out that build compiler performs vectorization
> >> >>>>>>of epilogues with --param vect-epilogues-nomask=1 passed:
> >> >>>>>>
> >> >>>>>>$ gcc -Ofast -mavx2 t1.c -S --param vect-epilogues-nomask=1 -o
> >> >>>>>>t1.new-nomask.s -fdump-tree-vect-details
> >> >>>>>>$ grep VECTORIZED -c t1.c.156t.vect
> >> >>>>>>4
> >> >>>>>> Without param only 2 loops are vectorized.
> >> >>>>>>
> >> >>>>>>Should I simply add a part of tests related to this feature or I must
> >> >>>>>>delete all not necessary changes also?
> >> >>>>>
> >> >>>>> Please remove all not necessary changes.
> >> >>>>>
> >> >>>>> Richard.
> >> >>>>>
> >> >>>>>>Thanks.
> >> >>>>>>Yuri.
> >> >>>>>>
> >> >>>>>>2016-11-14 16:40 GMT+03:00 Richard Biener <rguenther@suse.de>:
> >> >>>>>>> On Mon, 14 Nov 2016, Yuri Rumyantsev wrote:
> >> >>>>>>>
> >> >>>>>>>> Richard,
> >> >>>>>>>>
> >> >>>>>>>> In my previous patch I forgot to remove couple lines related to aux
> >> >>>>>>field.
> >> >>>>>>>> Here is the correct updated patch.
> >> >>>>>>>
> >> >>>>>>> Yeah, I noticed.  This patch would be ok for trunk (together with
> >> >>>>>>> necessary parts from 1 and 2) if all not required parts are removed
> >> >>>>>>> (and you'd add the testcases covering non-masked tail vect).
> >> >>>>>>>
> >> >>>>>>> Thus, can you please produce a single complete patch containing only
> >> >>>>>>> non-masked epilogue vectoriziation?
> >> >>>>>>>
> >> >>>>>>> Thanks,
> >> >>>>>>> Richard.
> >> >>>>>>>
> >> >>>>>>>> Thanks.
> >> >>>>>>>> Yuri.
> >> >>>>>>>>
> >> >>>>>>>> 2016-11-14 15:51 GMT+03:00 Richard Biener <rguenther@suse.de>:
> >> >>>>>>>> > On Fri, 11 Nov 2016, Yuri Rumyantsev wrote:
> >> >>>>>>>> >
> >> >>>>>>>> >> Richard,
> >> >>>>>>>> >>
> >> >>>>>>>> >> I prepare updated 3 patch with passing additional argument to
> >> >>>>>>>> >> vect_analyze_loop as you proposed (untested).
> >> >>>>>>>> >>
> >> >>>>>>>> >> You wrote:
> >> >>>>>>>> >> tw, I wonder if you can produce a single patch containing just
> >> >>>>>>>> >> epilogue vectorization, that is combine patches 1-3 but rip out
> >> >>>>>>>> >> changes only needed by later patches?
> >> >>>>>>>> >>
> >> >>>>>>>> >> Did you mean that I exclude all support for vectorization
> >> >>>>>>epilogues,
> >> >>>>>>>> >> i.e. exclude from 2-nd patch all non-related changes
> >> >>>>>>>> >> like
> >> >>>>>>>> >>
> >> >>>>>>>> >> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> >> >>>>>>>> >> index 11863af..32011c1 100644
> >> >>>>>>>> >> --- a/gcc/tree-vect-loop.c
> >> >>>>>>>> >> +++ b/gcc/tree-vect-loop.c
> >> >>>>>>>> >> @@ -1120,6 +1120,12 @@ new_loop_vec_info (struct loop *loop)
> >> >>>>>>>> >>    LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
> >> >>>>>>>> >>    LOOP_VINFO_PEELING_FOR_NITER (res) = false;
> >> >>>>>>>> >>    LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
> >> >>>>>>>> >> +  LOOP_VINFO_CAN_BE_MASKED (res) = false;
> >> >>>>>>>> >> +  LOOP_VINFO_REQUIRED_MASKS (res) = 0;
> >> >>>>>>>> >> +  LOOP_VINFO_COMBINE_EPILOGUE (res) = false;
> >> >>>>>>>> >> +  LOOP_VINFO_MASK_EPILOGUE (res) = false;
> >> >>>>>>>> >> +  LOOP_VINFO_NEED_MASKING (res) = false;
> >> >>>>>>>> >> +  LOOP_VINFO_ORIG_LOOP_INFO (res) = NULL;
> >> >>>>>>>> >
> >> >>>>>>>> > Yes.
> >> >>>>>>>> >
> >> >>>>>>>> >> Did you mean also that new combined patch must be working patch,
> >> >>>>>>i.e.
> >> >>>>>>>> >> can be integrated without other patches?
> >> >>>>>>>> >
> >> >>>>>>>> > Yes.
> >> >>>>>>>> >
> >> >>>>>>>> >> Could you please look at updated patch?
> >> >>>>>>>> >
> >> >>>>>>>> > Will do.
> >> >>>>>>>> >
> >> >>>>>>>> > Thanks,
> >> >>>>>>>> > Richard.
> >> >>>>>>>> >
> >> >>>>>>>> >> Thanks.
> >> >>>>>>>> >> Yuri.
> >> >>>>>>>> >>
> >> >>>>>>>> >> 2016-11-10 15:36 GMT+03:00 Richard Biener <rguenther@suse.de>:
> >> >>>>>>>> >> > On Thu, 10 Nov 2016, Richard Biener wrote:
> >> >>>>>>>> >> >
> >> >>>>>>>> >> >> On Tue, 8 Nov 2016, Yuri Rumyantsev wrote:
> >> >>>>>>>> >> >>
> >> >>>>>>>> >> >> > Richard,
> >> >>>>>>>> >> >> >
> >> >>>>>>>> >> >> > Here is updated 3 patch.
> >> >>>>>>>> >> >> >
> >> >>>>>>>> >> >> > I checked that all new tests related to epilogue
> >> >>>>>>vectorization passed with it.
> >> >>>>>>>> >> >> >
> >> >>>>>>>> >> >> > Your comments will be appreciated.
> >> >>>>>>>> >> >>
> >> >>>>>>>> >> >> A lot better now.  Instead of the ->aux dance I now prefer to
> >> >>>>>>>> >> >> pass the original loops loop_vinfo to vect_analyze_loop as
> >> >>>>>>>> >> >> optional argument (if non-NULL we analyze the epilogue of that
> >> >>>>>>>> >> >> loop_vinfo).  OTOH I remember we mainly use it to get at the
> >> >>>>>>>> >> >> original vectorization factor?  So we can pass down an
> >> >>>>>>(optional)
> >> >>>>>>>> >> >> forced vectorization factor as well?
> >> >>>>>>>> >> >
> >> >>>>>>>> >> > Btw, I wonder if you can produce a single patch containing just
> >> >>>>>>>> >> > epilogue vectorization, that is combine patches 1-3 but rip out
> >> >>>>>>>> >> > changes only needed by later patches?
> >> >>>>>>>> >> >
> >> >>>>>>>> >> > Thanks,
> >> >>>>>>>> >> > Richard.
> >> >>>>>>>> >> >
> >> >>>>>>>> >> >> Richard.
> >> >>>>>>>> >> >>
> >> >>>>>>>> >> >> > 2016-11-08 15:38 GMT+03:00 Richard Biener
> >> >>>>>><rguenther@suse.de>:
> >> >>>>>>>> >> >> > > On Thu, 3 Nov 2016, Yuri Rumyantsev wrote:
> >> >>>>>>>> >> >> > >
> >> >>>>>>>> >> >> > >> Hi Richard,
> >> >>>>>>>> >> >> > >>
> >> >>>>>>>> >> >> > >> I did not understand your last remark:
> >> >>>>>>>> >> >> > >>
> >> >>>>>>>> >> >> > >> > That is, here (and avoid the FOR_EACH_LOOP change):
> >> >>>>>>>> >> >> > >> >
> >> >>>>>>>> >> >> > >> > @@ -580,12 +586,21 @@ vectorize_loops (void)
> >> >>>>>>>> >> >> > >> >           && dump_enabled_p ())
> >> >>>>>>>> >> >> > >> >           dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
> >> >>>>>>vect_location,
> >> >>>>>>>> >> >> > >> >                            "loop vectorized\n");
> >> >>>>>>>> >> >> > >> > -       vect_transform_loop (loop_vinfo);
> >> >>>>>>>> >> >> > >> > +       new_loop = vect_transform_loop (loop_vinfo);
> >> >>>>>>>> >> >> > >> >         num_vectorized_loops++;
> >> >>>>>>>> >> >> > >> >        /* Now that the loop has been vectorized, allow
> >> >>>>>>it to be unrolled
> >> >>>>>>>> >> >> > >> >           etc.  */
> >> >>>>>>>> >> >> > >> >      loop->force_vectorize = false;
> >> >>>>>>>> >> >> > >> >
> >> >>>>>>>> >> >> > >> > +       /* Add new loop to a processing queue.  To make
> >> >>>>>>it easier
> >> >>>>>>>> >> >> > >> > +          to match loop and its epilogue vectorization
> >> >>>>>>in dumps
> >> >>>>>>>> >> >> > >> > +          put new loop as the next loop to process.
> >> >>>>>>*/
> >> >>>>>>>> >> >> > >> > +       if (new_loop)
> >> >>>>>>>> >> >> > >> > +         {
> >> >>>>>>>> >> >> > >> > +           loops.safe_insert (i + 1, new_loop->num);
> >> >>>>>>>> >> >> > >> > +           vect_loops_num = number_of_loops (cfun);
> >> >>>>>>>> >> >> > >> > +         }
> >> >>>>>>>> >> >> > >> >
> >> >>>>>>>> >> >> > >> > simply dispatch to a vectorize_epilogue (loop_vinfo,
> >> >>>>>>new_loop)
> >> >>>>>>>> >> >> > >> f> unction which will set up stuff properly (and also
> >> >>>>>>perform
> >> >>>>>>>> >> >> > >> > the if-conversion of the epilogue there).
> >> >>>>>>>> >> >> > >> >
> >> >>>>>>>> >> >> > >> > That said, if we can get in non-masked epilogue
> >> >>>>>>vectorization
> >> >>>>>>>> >> >> > >> > separately that would be great.
> >> >>>>>>>> >> >> > >>
> >> >>>>>>>> >> >> > >> Could you please clarify your proposal.
> >> >>>>>>>> >> >> > >
> >> >>>>>>>> >> >> > > When a loop was vectorized set things up to immediately
> >> >>>>>>vectorize
> >> >>>>>>>> >> >> > > its epilogue, avoiding changing the loop iteration and
> >> >>>>>>avoiding
> >> >>>>>>>> >> >> > > the re-use of ->aux.
> >> >>>>>>>> >> >> > >
> >> >>>>>>>> >> >> > > Richard.
> >> >>>>>>>> >> >> > >
> >> >>>>>>>> >> >> > >> Thanks.
> >> >>>>>>>> >> >> > >> Yuri.
> >> >>>>>>>> >> >> > >>
> >> >>>>>>>> >> >> > >> 2016-11-02 15:27 GMT+03:00 Richard Biener
> >> >>>>>><rguenther@suse.de>:
> >> >>>>>>>> >> >> > >> > On Tue, 1 Nov 2016, Yuri Rumyantsev wrote:
> >> >>>>>>>> >> >> > >> >
> >> >>>>>>>> >> >> > >> >> Hi All,
> >> >>>>>>>> >> >> > >> >>
> >> >>>>>>>> >> >> > >> >> I re-send all patches sent by Ilya earlier for review
> >> >>>>>>which support
> >> >>>>>>>> >> >> > >> >> vectorization of loop epilogues and loops with low
> >> >>>>>>trip count. We
> >> >>>>>>>> >> >> > >> >> assume that the only patch -
> >> >>>>>>vec-tails-07-combine-tail.patch - was not
> >> >>>>>>>> >> >> > >> >> approved by Jeff.
> >> >>>>>>>> >> >> > >> >>
> >> >>>>>>>> >> >> > >> >> I did re-base of all patches and performed
> >> >>>>>>bootstrapping and
> >> >>>>>>>> >> >> > >> >> regression testing that did not show any new failures.
> >> >>>>>>Also all
> >> >>>>>>>> >> >> > >> >> changes related to new vect_do_peeling algorithm have
> >> >>>>>>been changed
> >> >>>>>>>> >> >> > >> >> accordingly.
> >> >>>>>>>> >> >> > >> >>
> >> >>>>>>>> >> >> > >> >> Is it OK for trunk?
> >> >>>>>>>> >> >> > >> >
> >> >>>>>>>> >> >> > >> > I would have prefered that the series up to
> >> >>>>>>-03-nomask-tails would
> >> >>>>>>>> >> >> > >> > _only_ contain epilogue loop vectorization changes but
> >> >>>>>>unfortunately
> >> >>>>>>>> >> >> > >> > the patchset is oddly separated.
> >> >>>>>>>> >> >> > >> >
> >> >>>>>>>> >> >> > >> > I have a comment on that part nevertheless:
> >> >>>>>>>> >> >> > >> >
> >> >>>>>>>> >> >> > >> > @@ -1608,7 +1614,10 @@ vect_enhance_data_refs_alignment
> >> >>>>>>(loop_vec_info
> >> >>>>>>>> >> >> > >> > loop_vinfo)
> >> >>>>>>>> >> >> > >> >    /* Check if we can possibly peel the loop.  */
> >> >>>>>>>> >> >> > >> >    if (!vect_can_advance_ivs_p (loop_vinfo)
> >> >>>>>>>> >> >> > >> >        || !slpeel_can_duplicate_loop_p (loop,
> >> >>>>>>single_exit (loop))
> >> >>>>>>>> >> >> > >> > -      || loop->inner)
> >> >>>>>>>> >> >> > >> > +      || loop->inner
> >> >>>>>>>> >> >> > >> > +      /* Required peeling was performed in prologue
> >> >>>>>>and
> >> >>>>>>>> >> >> > >> > +        is not required for epilogue.  */
> >> >>>>>>>> >> >> > >> > +      || LOOP_VINFO_EPILOGUE_P (loop_vinfo))
> >> >>>>>>>> >> >> > >> >      do_peeling = false;
> >> >>>>>>>> >> >> > >> >
> >> >>>>>>>> >> >> > >> >    if (do_peeling
> >> >>>>>>>> >> >> > >> > @@ -1888,7 +1897,10 @@ vect_enhance_data_refs_alignment
> >> >>>>>>(loop_vec_info
> >> >>>>>>>> >> >> > >> > loop_vinfo)
> >> >>>>>>>> >> >> > >> >
> >> >>>>>>>> >> >> > >> >    do_versioning =
> >> >>>>>>>> >> >> > >> >         optimize_loop_nest_for_speed_p (loop)
> >> >>>>>>>> >> >> > >> > -       && (!loop->inner); /* FORNOW */
> >> >>>>>>>> >> >> > >> > +       && (!loop->inner) /* FORNOW */
> >> >>>>>>>> >> >> > >> > +        /* Required versioning was performed for the
> >> >>>>>>>> >> >> > >> > +          original loop and is not required for
> >> >>>>>>epilogue.  */
> >> >>>>>>>> >> >> > >> > +       && !LOOP_VINFO_EPILOGUE_P (loop_vinfo);
> >> >>>>>>>> >> >> > >> >
> >> >>>>>>>> >> >> > >> >    if (do_versioning)
> >> >>>>>>>> >> >> > >> >      {
> >> >>>>>>>> >> >> > >> >
> >> >>>>>>>> >> >> > >> > please do that check in the single caller of this
> >> >>>>>>function.
> >> >>>>>>>> >> >> > >> >
> >> >>>>>>>> >> >> > >> > Otherwise I still dislike the new ->aux use and I
> >> >>>>>>believe that simply
> >> >>>>>>>> >> >> > >> > passing down info from the processed parent would be
> >> >>>>>>_much_ cleaner.
> >> >>>>>>>> >> >> > >> > That is, here (and avoid the FOR_EACH_LOOP change):
> >> >>>>>>>> >> >> > >> >
> >> >>>>>>>> >> >> > >> > @@ -580,12 +586,21 @@ vectorize_loops (void)
> >> >>>>>>>> >> >> > >> >             && dump_enabled_p ())
> >> >>>>>>>> >> >> > >> >            dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
> >> >>>>>>vect_location,
> >> >>>>>>>> >> >> > >> >                             "loop vectorized\n");
> >> >>>>>>>> >> >> > >> > -       vect_transform_loop (loop_vinfo);
> >> >>>>>>>> >> >> > >> > +       new_loop = vect_transform_loop (loop_vinfo);
> >> >>>>>>>> >> >> > >> >         num_vectorized_loops++;
> >> >>>>>>>> >> >> > >> >         /* Now that the loop has been vectorized, allow
> >> >>>>>>it to be unrolled
> >> >>>>>>>> >> >> > >> >            etc.  */
> >> >>>>>>>> >> >> > >> >         loop->force_vectorize = false;
> >> >>>>>>>> >> >> > >> >
> >> >>>>>>>> >> >> > >> > +       /* Add new loop to a processing queue.  To make
> >> >>>>>>it easier
> >> >>>>>>>> >> >> > >> > +          to match loop and its epilogue vectorization
> >> >>>>>>in dumps
> >> >>>>>>>> >> >> > >> > +          put new loop as the next loop to process.
> >> >>>>>>*/
> >> >>>>>>>> >> >> > >> > +       if (new_loop)
> >> >>>>>>>> >> >> > >> > +         {
> >> >>>>>>>> >> >> > >> > +           loops.safe_insert (i + 1, new_loop->num);
> >> >>>>>>>> >> >> > >> > +           vect_loops_num = number_of_loops (cfun);
> >> >>>>>>>> >> >> > >> > +         }
> >> >>>>>>>> >> >> > >> >
> >> >>>>>>>> >> >> > >> > simply dispatch to a vectorize_epilogue (loop_vinfo,
> >> >>>>>>new_loop)
> >> >>>>>>>> >> >> > >> > function which will set up stuff properly (and also
> >> >>>>>>perform
> >> >>>>>>>> >> >> > >> > the if-conversion of the epilogue there).
> >> >>>>>>>> >> >> > >> >
> >> >>>>>>>> >> >> > >> > That said, if we can get in non-masked epilogue
> >> >>>>>>vectorization
> >> >>>>>>>> >> >> > >> > separately that would be great.
> >> >>>>>>>> >> >> > >> >
> >> >>>>>>>> >> >> > >> > I'm still torn about all the rest of the stuff and
> >> >>>>>>question its
> >> >>>>>>>> >> >> > >> > usability (esp. merging the epilogue with the main
> >> >>>>>>vector loop).
> >> >>>>>>>> >> >> > >> > But it has already been approved ... oh well.
> >> >>>>>>>> >> >> > >> >
> >> >>>>>>>> >> >> > >> > Thanks,
> >> >>>>>>>> >> >> > >> > Richard.
> >> >>>>>>>> >> >> > >>
> >> >>>>>>>> >> >> > >>
> >> >>>>>>>> >> >> > >
> >> >>>>>>>> >> >> > > --
> >> >>>>>>>> >> >> > > Richard Biener <rguenther@suse.de>
> >> >>>>>>>> >> >> > > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard,
> >> >>>>>>Graham Norton, HRB 21284 (AG Nuernberg)
> >> >>>>>>>> >> >> >
> >> >>>>>>>> >> >>
> >> >>>>>>>> >> >>
> >> >>>>>>>> >> >
> >> >>>>>>>> >> > --
> >> >>>>>>>> >> > Richard Biener <rguenther@suse.de>
> >> >>>>>>>> >> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham
> >> >>>>>>Norton, HRB 21284 (AG Nuernberg)
> >> >>>>>>>> >>
> >> >>>>>>>> >
> >> >>>>>>>> > --
> >> >>>>>>>> > Richard Biener <rguenther@suse.de>
> >> >>>>>>>> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham
> >> >>>>>>Norton, HRB 21284 (AG Nuernberg)
> >> >>>>>>>>
> >> >>>>>>>
> >> >>>>>>> --
> >> >>>>>>> Richard Biener <rguenther@suse.de>
> >> >>>>>>> SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham
> >> >>>>>>Norton, HRB 21284 (AG Nuernberg)
> >> >>>>>
> >> >>>>>
> >>
> >
> > --
> > Richard Biener <rguenther@suse.de>
> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)
>

Comments

Yuri Rumyantsev Dec. 1, 2016, 2:27 p.m. UTC | #1
Thanks Richard for your comments.

You asked me about possible performance improvements for AVX2 machines
- we did not see any visible speed-up for spec2k with any method of
masking, including epilogue masking and combining, only on AVX512
machine aka knl.

I will answer on your question later.

Best regards.
Yuri

2016-12-01 14:33 GMT+03:00 Richard Biener <rguenther@suse.de>:
> On Mon, 28 Nov 2016, Yuri Rumyantsev wrote:
>
>> Richard!
>>
>> I attached vect dump for hte part of attached test-case which
>> illustrated how vectorization of epilogues works through masking:
>> #define SIZE 1023
>> #define ALIGN 64
>>
>> extern int posix_memalign(void **memptr, __SIZE_TYPE__ alignment,
>> __SIZE_TYPE__ size) __attribute__((weak));
>> extern void free (void *);
>>
>> void __attribute__((noinline))
>> test_citer (int * __restrict__ a,
>>    int * __restrict__ b,
>>    int * __restrict__ c)
>> {
>>   int i;
>>
>>   a = (int *)__builtin_assume_aligned (a, ALIGN);
>>   b = (int *)__builtin_assume_aligned (b, ALIGN);
>>   c = (int *)__builtin_assume_aligned (c, ALIGN);
>>
>>   for (i = 0; i < SIZE; i++)
>>     c[i] = a[i] + b[i];
>> }
>>
>> It was compiled with -mavx2 --param vect-epilogues-mask=1 options.
>>
>> I did not include in this patch vectorization of low trip-count loops
>> since in the original patch additional parameter was introduced:
>> +DEFPARAM (PARAM_VECT_SHORT_LOOPS,
>> +  "vect-short-loops",
>> +  "Enable vectorization of low trip count loops using masking.",
>> +  0, 0, 1)
>>
>> I assume that this ability can be included very quickly but it
>> requires cost model enhancements also.
>
> Comments on the patch itself (as I'm having a closer look again,
> I know how it vectorizes the above but I wondered why epilogue
> and short-trip loops are not basically the same code path).
>
> Btw, I don't like that the features are behind a --param paywall.
> That just means a) nobody will use it, b) it will bit-rot quickly,
> c) bugs are well-hidden.
>
> +  if (loop_vinfo && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
> +      && integer_zerop (nested_in_vect_loop
> +                       ? STMT_VINFO_DR_STEP (stmt_info)
> +                       : DR_STEP (dr)))
> +    {
> +      if (dump_enabled_p ())
> +       dump_printf_loc (MSG_NOTE, vect_location,
> +                        "allow invariant load for masked loop.\n");
> +    }
>
> this can test memory_access_type == VMAT_INVARIANT.  Please put
> all the checks in a common
>
>   if (loop_vinfo && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
>     {
>        if (memory_access_type == VMAT_INVARIANT)
>          {
>          }
>        else if (...)
>          {
>             LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = false;
>          }
>        else if (..)
> ...
>     }
>
> @@ -6667,6 +6756,15 @@ vectorizable_load (gimple *stmt,
> gimple_stmt_iterator *gsi, gimple **vec_stmt,
>        gcc_assert (!nested_in_vect_loop);
>        gcc_assert (!STMT_VINFO_GATHER_SCATTER_P (stmt_info));
>
> +      if (loop_vinfo && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
> +       {
> +         if (dump_enabled_p ())
> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                            "cannot be masked: grouped access is not"
> +                            " supported.");
> +         LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = false;
> +      }
> +
>
> isn't this already handled by the above?  Or rather the general
> disallowance of SLP?
>
> @@ -5730,6 +5792,24 @@ vectorizable_store (gimple *stmt,
> gimple_stmt_iterator *gsi, gimple **vec_stmt,
>                             &memory_access_type, &gs_info))
>      return false;
>
> +  if (loop_vinfo && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
> +      && memory_access_type != VMAT_CONTIGUOUS)
> +    {
> +      LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = false;
> +      if (dump_enabled_p ())
> +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                        "cannot be masked: unsupported memory access
> type.\n");
> +    }
> +
> +  if (loop_vinfo && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
> +      && !can_mask_load_store (stmt))
> +    {
> +      LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = false;
> +      if (dump_enabled_p ())
> +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                        "cannot be masked: unsupported masked store.\n");
> +    }
> +
>
> likewise please combine the ifs.
>
> @@ -2354,7 +2401,10 @@ vectorizable_mask_load_store (gimple *stmt,
> gimple_stmt_iterator *gsi,
>                                           ptr, vec_mask, vec_rhs);
>           vect_finish_stmt_generation (stmt, new_stmt, gsi);
>           if (i == 0)
> -           STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
> +           {
> +             STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
> +             STMT_VINFO_FIRST_COPY_P (vinfo_for_stmt (new_stmt)) = true;
> +           }
>           else
>             STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
>           prev_stmt_info = vinfo_for_stmt (new_stmt);
>
> here you only set the flag, elsewhere you copy DR and VECTYPE as well.
>
> @@ -2113,6 +2146,20 @@ vectorizable_mask_load_store (gimple *stmt,
> gimple_stmt_iterator *gsi,
>                && !useless_type_conversion_p (vectype, rhs_vectype)))
>      return false;
>
> +  if (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
> +    {
> +      /* Check that mask conjuction is supported.  */
> +      optab tab;
> +      tab = optab_for_tree_code (BIT_AND_EXPR, vectype, optab_default);
> +      if (!tab || optab_handler (tab, TYPE_MODE (vectype)) ==
> CODE_FOR_nothing)
> +       {
> +         if (dump_enabled_p ())
> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                            "cannot be masked: unsupported mask
> operation\n");
> +         LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = false;
> +       }
> +    }
>
> does this really test whether we can bit-and the mask?  You are
> using the vector type of the store (which might be V2DF for example),
> also for AVX512 it might be a vector-bool type with integer mode?
> Of course we maybe can simply assume mask conjunction is available
> (I know no ISA where that would be not true).
>
> +/* Return true if STMT can be converted to masked form.  */
> +
> +static bool
> +can_mask_load_store (gimple *stmt)
> +{
> +  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
> +  tree vectype, mask_vectype;
> +  tree lhs, ref;
> +
> +  if (!stmt_info)
> +    return false;
> +  lhs = gimple_assign_lhs (stmt);
> +  ref = (TREE_CODE (lhs) == SSA_NAME) ? gimple_assign_rhs1 (stmt) : lhs;
> +  if (may_be_nonaddressable_p (ref))
> +    return false;
> +  vectype = STMT_VINFO_VECTYPE (stmt_info);
>
> You probably modeled this after ifcvt_can_use_mask_load_store but I
> don't think checking may_be_nonaddressable_p is necessary (we couldn't
> even vectorize such refs).  stmt_info should never be NULL either.
> With the check removed tree-ssa-loop-ivopts.h should no longer be
> necessary.
>
> +static void
> +vect_mask_load_store_stmt (gimple *stmt, tree vectype, tree mask,
> +                          data_reference *dr, gimple_stmt_iterator *si)
> +{
> ...
> +  addr = force_gimple_operand_gsi (&gsi, build_fold_addr_expr (mem),
> +                                  true, NULL_TREE, true,
> +                                  GSI_SAME_STMT);
> +
> +  align = TYPE_ALIGN_UNIT (vectype);
> +  if (aligned_access_p (dr))
> +    misalign = 0;
> +  else if (DR_MISALIGNMENT (dr) == -1)
> +    {
> +      align = TYPE_ALIGN_UNIT (elem_type);
> +      misalign = 0;
> +    }
> +  else
> +    misalign = DR_MISALIGNMENT (dr);
> +  set_ptr_info_alignment (get_ptr_info (addr), align, misalign);
> +  ptr = build_int_cst (reference_alias_ptr_type (mem),
> +                      misalign ? misalign & -misalign : align);
>
> you should simply use
>
>   align = get_object_alignment (mem) / BITS_PER_UNIT;
>
> here rather than trying to be clever.  Eventually you don't need
> the DR then (see question above).
>
> +    }
> +  gsi_replace (si ? si : &gsi, new_stmt, false);
>
> when you replace the load/store please previously copy VUSE and VDEF
> from the original one (we were nearly clean enough to no longer
> require a virtual operand rewrite after vectorization...)  Thus
>
>   gimple_set_vuse (new_stmt, gimple_vuse (stmt));
>   gimple_set_vdef (new_stmt, gimple_vdef (stmt));
>
> +static void
> +vect_mask_loop (loop_vec_info loop_vinfo)
> +{
> ...
> +  /* Scan all loop statements to convert vector load/store including
> masked
> +     form.  */
> +  for (unsigned i = 0; i < loop->num_nodes; i++)
> +    {
> +      basic_block bb = bbs[i];
> +      for (gimple_stmt_iterator si = gsi_start_bb (bb);
> +          !gsi_end_p (si); gsi_next (&si))
> +       {
> +         gimple *stmt = gsi_stmt (si);
> +         stmt_vec_info stmt_info = NULL;
> +         tree vectype = NULL;
> +         data_reference *dr;
> +
> +         /* Mask load case.  */
> +         if (is_gimple_call (stmt)
> +             && gimple_call_internal_p (stmt)
> +             && gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
> +             && !VECTOR_TYPE_P (TREE_TYPE (gimple_call_arg (stmt, 2))))
> +           {
> ...
> +             /* Skip invariant loads.  */
> +             if (integer_zerop (nested_in_vect_loop_p (loop, stmt)
> +                                ? STMT_VINFO_DR_STEP (stmt_info)
> +                                : DR_STEP (STMT_VINFO_DATA_REF
> (stmt_info))))
> +               continue;
>
> seeing this it would be nice if stmt_info had a flag for whether
> the stmt needs masking (and a flag on wheter this is a scalar or a
> vectorized stmt).
>
> +         /* Skip hoisted out statements.  */
> +         if (!flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
> +           continue;
>
> err, you walk stmts in the loop!  Isn't this covered by the above
> skipping of 'invariant loads'?
>
> +static gimple *
> +vect_mask_reduction_stmt (gimple *stmt, tree mask, gimple *prev)
> +{
>
> depending on the reduction operand there are variants that
> could get away w/o the VEC_COND_EXPR, like
>
>   S1': tem_4 = d_3 & MASK;
>   S2': r_1 = r_2 + tem_4;
>
> which works for plus at least.  More generally doing
>
>   S1': tem_4 = VEC_COND_EXPR<MASK, d_3, neutral operand>
>   S2': r_1 = r_2 OP tem_4;
>
> and leaving optimization to & to later opts (& won't work for
> AVX512 mask registers I guess).
>
> Good enough for later enhacement of course.
>
> +static void
> +vect_gen_ivs_for_masking (loop_vec_info loop_vinfo, vec<tree> *ivs)
> +{
> ...
>
> isn't it enough to always create a single IV and derive the
> additional copies by IV + i * { elems, elems, elems ... }?
> IVs are expensive -- I'm sure we can optimize the rest of the
> scheme further as well but this one looks obvious to me.
>
> @@ -3225,12 +3508,32 @@ vect_estimate_min_profitable_iters (loop_vec_info
> loop_vinfo,
>    int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
>    void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
>
> +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
> +    {
> +      /* Currently we don't produce scalar epilogue version in case
> +        its masked version is provided.  It means we don't need to
> +        compute profitability one more time here.  Just make a
> +        masked loop version.  */
> +      if (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
> +         && PARAM_VALUE (PARAM_VECT_EPILOGUES_MASK))
> +       {
> +         dump_printf_loc (MSG_NOTE, vect_location,
> +                          "cost model: mask loop epilogue.\n");
> +         LOOP_VINFO_MASK_LOOP (loop_vinfo) = true;
> +         *ret_min_profitable_niters = 0;
> +         *ret_min_profitable_estimate = 0;
> +         return;
> +       }
> +    }
>    /* Cost model disabled.  */
> -  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
> +  else if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
>      {
>        dump_printf_loc (MSG_NOTE, vect_location, "cost model
> disabled.\n");
>        *ret_min_profitable_niters = 0;
>        *ret_min_profitable_estimate = 0;
> +      if (PARAM_VALUE (PARAM_VECT_EPILOGUES_MASK)
> +         && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
> +       LOOP_VINFO_MASK_LOOP (loop_vinfo) = true;
>        return;
>      }
>
> the unlimited_cost_model case should come first?  OTOH masking or
> not is probably not sth covered by 'unlimited' - that is about
> vectorizing or not.  But the above code means that for
> epilogue vectorization w/o masking we ignore unlimited_cost_model ()?
> That doesn't make sense to me.
>
> Plus if this is short-trip or epilogue vectorization and the
> cost model is _not_ unlimited then we dont' want to enable
> masking always (if it is possible).  It might be we statically
> know the epilogue executes for at most two iterations for example.
>
> I don't see _any_ cost model for vectorizing the epilogue with
> masking?  Am I missing something?  A "trivial" cost model
> should at least consider the additional IV(s), the mask
> compute and the widening and narrowing ops required.
>
> diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
> index e13d6a2..36be342 100644
> --- a/gcc/tree-vect-loop-manip.c
> +++ b/gcc/tree-vect-loop-manip.c
> @@ -1635,6 +1635,13 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree
> niters, tree nitersm1,
>    bool epilog_peeling = (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
>                          || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
>
> +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
> +    {
> +      prolog_peeling = false;
> +      if (LOOP_VINFO_MASK_LOOP (loop_vinfo))
> +       epilog_peeling = false;
> +    }
> +
>    if (!prolog_peeling && !epilog_peeling)
>      return NULL;
>
> I think the prolog_peeling was fixed during the epilogue vectorization
> review and should no longer be necessary.  Please add
> a && ! LOOP_VINFO_MASK_LOOP () to the epilog_peeling init instead
> (it should also work for short-trip loop vectorization).
>
> @@ -2022,11 +2291,18 @@ start_over:
>        || (max_niter != -1
>           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
>      {
> -      if (dump_enabled_p ())
> -       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                        "not vectorized: iteration count smaller than "
> -                        "vectorization factor.\n");
> -      return false;
> +      /* Allow low trip count for loop epilogue we want to mask.  */
> +      if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
> +         && PARAM_VALUE (PARAM_VECT_EPILOGUES_MASK))
> +       LOOP_VINFO_MASK_LOOP (loop_vinfo) = true;
> +      else
> +       {
> +         if (dump_enabled_p ())
>
> so why do we test only LOOP_VINFO_EPILOGUE_P here?  All the code
> I saw sofar would also work for the main loop (but the cost
> model is missing).
>
> I am missing testcases.  There's only a single one but we should
> have cases covering all kinds of mask IV widths and widen/shorten
> masks.
>
> Do you have any numbers on SPEC 2k6 with epilogue vect and/or masking
> enabled for an AVX2 machine?
>
> Oh, and I really dislike the --param paywall.
>
> Thanks,
> Richard.
>
>> Best regards.
>> Yuri.
>>
>>
>> 2016-11-28 17:39 GMT+03:00 Richard Biener <rguenther@suse.de>:
>> > On Thu, 24 Nov 2016, Yuri Rumyantsev wrote:
>> >
>> >> Hi All,
>> >>
>> >> Here is the second patch which supports epilogue vectorization using
>> >> masking without cost model. Currently it is possible
>> >> only with passing parameter "--param vect-epilogues-mask=1".
>> >>
>> >> Bootstrapping and regression testing did not show any new regression.
>> >>
>> >> Any comments will be appreciated.
>> >
>> > Going over the patch the main question is one how it works -- it looks
>> > like the decision whether to vectorize & mask the epilogue is made
>> > when vectorizing the loop that generates the epilogue rather than
>> > in the epilogue vectorization path?
>> >
>> > That is, I'd have expected to see this handling low-trip count loops
>> > by masking?  And thus masking the epilogue simply by it being
>> > low-trip count?
>> >
>> > Richard.
>> >
>> >> ChangeLog:
>> >> 2016-11-24  Yuri Rumyantsev  <ysrumyan@gmail.com>
>> >>
>> >> * params.def (PARAM_VECT_EPILOGUES_MASK): New.
>> >> * tree-vect-data-refs.c (vect_get_new_ssa_name): Support vect_mask_var.
>> >> * tree-vect-loop.c: Include insn-config.h, recog.h and alias.h.
>> >> (new_loop_vec_info): Add zeroing can_be_masked, mask_loop and
>> >> required_mask fields.
>> >> (vect_check_required_masks_widening): New.
>> >> (vect_check_required_masks_narrowing): New.
>> >> (vect_get_masking_iv_elems): New.
>> >> (vect_get_masking_iv_type): New.
>> >> (vect_get_extreme_masks): New.
>> >> (vect_check_required_masks): New.
>> >> (vect_analyze_loop_operations): Call vect_check_required_masks if all
>> >> statements can be masked.
>> >> (vect_analyze_loop_2): Inititalize to zero min_scalar_loop_bound.
>> >> Add check that epilogue can be masked with the same vf with issue
>> >> fail notes.  Allow epilogue vectorization through masking of low trip
>> >> loops. Set to true can_be_masked field before loop operation analysis.
>> >> Do not set-up min_scalar_loop_bound for epilogue vectorization through
>> >> masking.  Do not peeling for epilogue masking.  Reset can_be_masked
>> >> field before repeat analysis.
>> >> (vect_estimate_min_profitable_iters): Do not compute profitability
>> >> for epilogue masking.  Set up mask_loop filed to true if parameter
>> >> PARAM_VECT_EPILOGUES_MASK is non-zero.
>> >> (vectorizable_reduction): Add check that statement can be masked.
>> >> (vectorizable_induction): Do not support masking for induction.
>> >> (vect_gen_ivs_for_masking): New.
>> >> (vect_get_mask_index_for_elems): New.
>> >> (vect_get_mask_index_for_type): New.
>> >> (vect_create_narrowed_masks): New.
>> >> (vect_create_widened_masks): New.
>> >> (vect_gen_loop_masks): New.
>> >> (vect_mask_reduction_stmt): New.
>> >> (vect_mask_mask_load_store_stmt): New.
>> >> (vect_mask_load_store_stmt): New.
>> >> (vect_mask_loop): New.
>> >> (vect_transform_loop): Invoke vect_mask_loop if required.
>> >> Use div_ceil to recompute upper bounds for masked loops.  Issue
>> >> statistics for epilogue vectorization through masking. Do not reduce
>> >> vf for masking epilogue.
>> >> * tree-vect-stmts.c: Include tree-ssa-loop-ivopts.h.
>> >> (can_mask_load_store): New.
>> >> (vectorizable_mask_load_store): Check that mask conjuction is
>> >> supported.  Set-up first_copy_p field of stmt_vinfo.
>> >> (vectorizable_simd_clone_call): Check that simd clone can not be
>> >> masked.
>> >> (vectorizable_store): Check that store can be masked. Mark the first
>> >> copy of generated vector stores and provide it with vectype and the
>> >> original data reference.
>> >> (vectorizable_load): Check that load can be masked.
>> >> (vect_stmt_should_be_masked_for_epilogue): New.
>> >> (vect_add_required_mask_for_stmt): New.
>> >> (vect_analyze_stmt): Add check on unsupported statements for masking
>> >> with printing message.
>> >> * tree-vectorizer.h (struct _loop_vec_info): Add new fields
>> >> can_be_maske, required_masks, masl_loop.
>> >> (LOOP_VINFO_CAN_BE_MASKED): New.
>> >> (LOOP_VINFO_REQUIRED_MASKS): New.
>> >> (LOOP_VINFO_MASK_LOOP): New.
>> >> (struct _stmt_vec_info): Add first_copy_p field.
>> >> (STMT_VINFO_FIRST_COPY_P): New.
>> >>
>> >> gcc/testsuite/
>> >>
>> >> * gcc.dg/vect/vect-tail-mask-1.c: New test.
>> >>
>> >> 2016-11-18 18:54 GMT+03:00 Christophe Lyon <christophe.lyon@linaro.org>:
>> >> > On 18 November 2016 at 16:46, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>> >> >> It is very strange that this test failed on arm, since it requires
>> >> >> target avx2 to check vectorizer dumps:
>> >> >>
>> >> >> /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" {
>> >> >> target avx2_runtime } } } */
>> >> >> /* { dg-final { scan-tree-dump-times "LOOP EPILOGUE VECTORIZED
>> >> >> \\(VS=16\\)" 2 "vect" { target avx2_runtime } } } */
>> >> >>
>> >> >> Could you please clarify what is the reason of the failure?
>> >> >
>> >> > It's not the scan-dumps that fail, but the execution.
>> >> > The test calls abort() for some reason.
>> >> >
>> >> > It will take me a while to rebuild the test manually in the right
>> >> > debug environment to provide you with more traces.
>> >> >
>> >> >
>> >> >
>> >> >>
>> >> >> Thanks.
>> >> >>
>> >> >> 2016-11-18 16:20 GMT+03:00 Christophe Lyon <christophe.lyon@linaro.org>:
>> >> >>> On 15 November 2016 at 15:41, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>> >> >>>> Hi All,
>> >> >>>>
>> >> >>>> Here is patch for non-masked epilogue vectoriziation.
>> >> >>>>
>> >> >>>> Bootstrap and regression testing did not show any new failures.
>> >> >>>>
>> >> >>>> Is it OK for trunk?
>> >> >>>>
>> >> >>>> Thanks.
>> >> >>>> Changelog:
>> >> >>>>
>> >> >>>> 2016-11-15  Yuri Rumyantsev  <ysrumyan@gmail.com>
>> >> >>>>
>> >> >>>> * params.def (PARAM_VECT_EPILOGUES_NOMASK): New.
>> >> >>>> * tree-if-conv.c (tree_if_conversion): Make public.
>> >> >>>> * * tree-if-conv.h: New file.
>> >> >>>> * tree-vect-data-refs.c (vect_analyze_data_ref_dependences) Avoid
>> >> >>>> dynamic alias checks for epilogues.
>> >> >>>> * tree-vect-loop-manip.c (vect_do_peeling): Return created epilog.
>> >> >>>> * tree-vect-loop.c: include tree-if-conv.h.
>> >> >>>> (new_loop_vec_info): Add zeroing orig_loop_info field.
>> >> >>>> (vect_analyze_loop_2): Don't try to enhance alignment for epilogues.
>> >> >>>> (vect_analyze_loop): Add argument ORIG_LOOP_INFO which is not NULL
>> >> >>>> if epilogue is vectorized, set up orig_loop_info field of loop_vinfo
>> >> >>>> using passed argument.
>> >> >>>> (vect_transform_loop): Check if created epilogue should be returned
>> >> >>>> for further vectorization with less vf.  If-convert epilogue if
>> >> >>>> required. Print vectorization success for epilogue.
>> >> >>>> * tree-vectorizer.c (vectorize_loops): Add epilogue vectorization
>> >> >>>> if it is required, pass loop_vinfo produced during vectorization of
>> >> >>>> loop body to vect_analyze_loop.
>> >> >>>> * tree-vectorizer.h (struct _loop_vec_info): Add new field
>> >> >>>> orig_loop_info.
>> >> >>>> (LOOP_VINFO_ORIG_LOOP_INFO): New.
>> >> >>>> (LOOP_VINFO_EPILOGUE_P): New.
>> >> >>>> (LOOP_VINFO_ORIG_VECT_FACTOR): New.
>> >> >>>> (vect_do_peeling): Change prototype to return epilogue.
>> >> >>>> (vect_analyze_loop): Add argument of loop_vec_info type.
>> >> >>>> (vect_transform_loop): Return created loop.
>> >> >>>>
>> >> >>>> gcc/testsuite/
>> >> >>>>
>> >> >>>> * lib/target-supports.exp (check_avx2_hw_available): New.
>> >> >>>> (check_effective_target_avx2_runtime): New.
>> >> >>>> * gcc.dg/vect/vect-tail-nomask-1.c: New test.
>> >> >>>>
>> >> >>>
>> >> >>> Hi,
>> >> >>>
>> >> >>> This new test fails on arm-none-eabi (using default cpu/fpu/mode):
>> >> >>>   gcc.dg/vect/vect-tail-nomask-1.c -flto -ffat-lto-objects execution test
>> >> >>>   gcc.dg/vect/vect-tail-nomask-1.c execution test
>> >> >>>
>> >> >>> It does pass on the same target if configured --with-cpu=cortex-a9.
>> >> >>>
>> >> >>> Christophe
>> >> >>>
>> >> >>>
>> >> >>>
>> >> >>>>
>> >> >>>> 2016-11-14 20:04 GMT+03:00 Richard Biener <rguenther@suse.de>:
>> >> >>>>> On November 14, 2016 4:39:40 PM GMT+01:00, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>> >> >>>>>>Richard,
>> >> >>>>>>
>> >> >>>>>>I checked one of the tests designed for epilogue vectorization using
>> >> >>>>>>patches 1 - 3 and found out that build compiler performs vectorization
>> >> >>>>>>of epilogues with --param vect-epilogues-nomask=1 passed:
>> >> >>>>>>
>> >> >>>>>>$ gcc -Ofast -mavx2 t1.c -S --param vect-epilogues-nomask=1 -o
>> >> >>>>>>t1.new-nomask.s -fdump-tree-vect-details
>> >> >>>>>>$ grep VECTORIZED -c t1.c.156t.vect
>> >> >>>>>>4
>> >> >>>>>> Without param only 2 loops are vectorized.
>> >> >>>>>>
>> >> >>>>>>Should I simply add a part of tests related to this feature or I must
>> >> >>>>>>delete all not necessary changes also?
>> >> >>>>>
>> >> >>>>> Please remove all not necessary changes.
>> >> >>>>>
>> >> >>>>> Richard.
>> >> >>>>>
>> >> >>>>>>Thanks.
>> >> >>>>>>Yuri.
>> >> >>>>>>
>> >> >>>>>>2016-11-14 16:40 GMT+03:00 Richard Biener <rguenther@suse.de>:
>> >> >>>>>>> On Mon, 14 Nov 2016, Yuri Rumyantsev wrote:
>> >> >>>>>>>
>> >> >>>>>>>> Richard,
>> >> >>>>>>>>
>> >> >>>>>>>> In my previous patch I forgot to remove couple lines related to aux
>> >> >>>>>>field.
>> >> >>>>>>>> Here is the correct updated patch.
>> >> >>>>>>>
>> >> >>>>>>> Yeah, I noticed.  This patch would be ok for trunk (together with
>> >> >>>>>>> necessary parts from 1 and 2) if all not required parts are removed
>> >> >>>>>>> (and you'd add the testcases covering non-masked tail vect).
>> >> >>>>>>>
>> >> >>>>>>> Thus, can you please produce a single complete patch containing only
>> >> >>>>>>> non-masked epilogue vectoriziation?
>> >> >>>>>>>
>> >> >>>>>>> Thanks,
>> >> >>>>>>> Richard.
>> >> >>>>>>>
>> >> >>>>>>>> Thanks.
>> >> >>>>>>>> Yuri.
>> >> >>>>>>>>
>> >> >>>>>>>> 2016-11-14 15:51 GMT+03:00 Richard Biener <rguenther@suse.de>:
>> >> >>>>>>>> > On Fri, 11 Nov 2016, Yuri Rumyantsev wrote:
>> >> >>>>>>>> >
>> >> >>>>>>>> >> Richard,
>> >> >>>>>>>> >>
>> >> >>>>>>>> >> I prepare updated 3 patch with passing additional argument to
>> >> >>>>>>>> >> vect_analyze_loop as you proposed (untested).
>> >> >>>>>>>> >>
>> >> >>>>>>>> >> You wrote:
>> >> >>>>>>>> >> tw, I wonder if you can produce a single patch containing just
>> >> >>>>>>>> >> epilogue vectorization, that is combine patches 1-3 but rip out
>> >> >>>>>>>> >> changes only needed by later patches?
>> >> >>>>>>>> >>
>> >> >>>>>>>> >> Did you mean that I exclude all support for vectorization
>> >> >>>>>>epilogues,
>> >> >>>>>>>> >> i.e. exclude from 2-nd patch all non-related changes
>> >> >>>>>>>> >> like
>> >> >>>>>>>> >>
>> >> >>>>>>>> >> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
>> >> >>>>>>>> >> index 11863af..32011c1 100644
>> >> >>>>>>>> >> --- a/gcc/tree-vect-loop.c
>> >> >>>>>>>> >> +++ b/gcc/tree-vect-loop.c
>> >> >>>>>>>> >> @@ -1120,6 +1120,12 @@ new_loop_vec_info (struct loop *loop)
>> >> >>>>>>>> >>    LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
>> >> >>>>>>>> >>    LOOP_VINFO_PEELING_FOR_NITER (res) = false;
>> >> >>>>>>>> >>    LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
>> >> >>>>>>>> >> +  LOOP_VINFO_CAN_BE_MASKED (res) = false;
>> >> >>>>>>>> >> +  LOOP_VINFO_REQUIRED_MASKS (res) = 0;
>> >> >>>>>>>> >> +  LOOP_VINFO_COMBINE_EPILOGUE (res) = false;
>> >> >>>>>>>> >> +  LOOP_VINFO_MASK_EPILOGUE (res) = false;
>> >> >>>>>>>> >> +  LOOP_VINFO_NEED_MASKING (res) = false;
>> >> >>>>>>>> >> +  LOOP_VINFO_ORIG_LOOP_INFO (res) = NULL;
>> >> >>>>>>>> >
>> >> >>>>>>>> > Yes.
>> >> >>>>>>>> >
>> >> >>>>>>>> >> Did you mean also that new combined patch must be working patch,
>> >> >>>>>>i.e.
>> >> >>>>>>>> >> can be integrated without other patches?
>> >> >>>>>>>> >
>> >> >>>>>>>> > Yes.
>> >> >>>>>>>> >
>> >> >>>>>>>> >> Could you please look at updated patch?
>> >> >>>>>>>> >
>> >> >>>>>>>> > Will do.
>> >> >>>>>>>> >
>> >> >>>>>>>> > Thanks,
>> >> >>>>>>>> > Richard.
>> >> >>>>>>>> >
>> >> >>>>>>>> >> Thanks.
>> >> >>>>>>>> >> Yuri.
>> >> >>>>>>>> >>
>> >> >>>>>>>> >> 2016-11-10 15:36 GMT+03:00 Richard Biener <rguenther@suse.de>:
>> >> >>>>>>>> >> > On Thu, 10 Nov 2016, Richard Biener wrote:
>> >> >>>>>>>> >> >
>> >> >>>>>>>> >> >> On Tue, 8 Nov 2016, Yuri Rumyantsev wrote:
>> >> >>>>>>>> >> >>
>> >> >>>>>>>> >> >> > Richard,
>> >> >>>>>>>> >> >> >
>> >> >>>>>>>> >> >> > Here is updated 3 patch.
>> >> >>>>>>>> >> >> >
>> >> >>>>>>>> >> >> > I checked that all new tests related to epilogue
>> >> >>>>>>vectorization passed with it.
>> >> >>>>>>>> >> >> >
>> >> >>>>>>>> >> >> > Your comments will be appreciated.
>> >> >>>>>>>> >> >>
>> >> >>>>>>>> >> >> A lot better now.  Instead of the ->aux dance I now prefer to
>> >> >>>>>>>> >> >> pass the original loops loop_vinfo to vect_analyze_loop as
>> >> >>>>>>>> >> >> optional argument (if non-NULL we analyze the epilogue of that
>> >> >>>>>>>> >> >> loop_vinfo).  OTOH I remember we mainly use it to get at the
>> >> >>>>>>>> >> >> original vectorization factor?  So we can pass down an
>> >> >>>>>>(optional)
>> >> >>>>>>>> >> >> forced vectorization factor as well?
>> >> >>>>>>>> >> >
>> >> >>>>>>>> >> > Btw, I wonder if you can produce a single patch containing just
>> >> >>>>>>>> >> > epilogue vectorization, that is combine patches 1-3 but rip out
>> >> >>>>>>>> >> > changes only needed by later patches?
>> >> >>>>>>>> >> >
>> >> >>>>>>>> >> > Thanks,
>> >> >>>>>>>> >> > Richard.
>> >> >>>>>>>> >> >
>> >> >>>>>>>> >> >> Richard.
>> >> >>>>>>>> >> >>
>> >> >>>>>>>> >> >> > 2016-11-08 15:38 GMT+03:00 Richard Biener
>> >> >>>>>><rguenther@suse.de>:
>> >> >>>>>>>> >> >> > > On Thu, 3 Nov 2016, Yuri Rumyantsev wrote:
>> >> >>>>>>>> >> >> > >
>> >> >>>>>>>> >> >> > >> Hi Richard,
>> >> >>>>>>>> >> >> > >>
>> >> >>>>>>>> >> >> > >> I did not understand your last remark:
>> >> >>>>>>>> >> >> > >>
>> >> >>>>>>>> >> >> > >> > That is, here (and avoid the FOR_EACH_LOOP change):
>> >> >>>>>>>> >> >> > >> >
>> >> >>>>>>>> >> >> > >> > @@ -580,12 +586,21 @@ vectorize_loops (void)
>> >> >>>>>>>> >> >> > >> >           && dump_enabled_p ())
>> >> >>>>>>>> >> >> > >> >           dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
>> >> >>>>>>vect_location,
>> >> >>>>>>>> >> >> > >> >                            "loop vectorized\n");
>> >> >>>>>>>> >> >> > >> > -       vect_transform_loop (loop_vinfo);
>> >> >>>>>>>> >> >> > >> > +       new_loop = vect_transform_loop (loop_vinfo);
>> >> >>>>>>>> >> >> > >> >         num_vectorized_loops++;
>> >> >>>>>>>> >> >> > >> >        /* Now that the loop has been vectorized, allow
>> >> >>>>>>it to be unrolled
>> >> >>>>>>>> >> >> > >> >           etc.  */
>> >> >>>>>>>> >> >> > >> >      loop->force_vectorize = false;
>> >> >>>>>>>> >> >> > >> >
>> >> >>>>>>>> >> >> > >> > +       /* Add new loop to a processing queue.  To make
>> >> >>>>>>it easier
>> >> >>>>>>>> >> >> > >> > +          to match loop and its epilogue vectorization
>> >> >>>>>>in dumps
>> >> >>>>>>>> >> >> > >> > +          put new loop as the next loop to process.
>> >> >>>>>>*/
>> >> >>>>>>>> >> >> > >> > +       if (new_loop)
>> >> >>>>>>>> >> >> > >> > +         {
>> >> >>>>>>>> >> >> > >> > +           loops.safe_insert (i + 1, new_loop->num);
>> >> >>>>>>>> >> >> > >> > +           vect_loops_num = number_of_loops (cfun);
>> >> >>>>>>>> >> >> > >> > +         }
>> >> >>>>>>>> >> >> > >> >
>> >> >>>>>>>> >> >> > >> > simply dispatch to a vectorize_epilogue (loop_vinfo,
>> >> >>>>>>new_loop)
>> >> >>>>>>>> >> >> > >> f> unction which will set up stuff properly (and also
>> >> >>>>>>perform
>> >> >>>>>>>> >> >> > >> > the if-conversion of the epilogue there).
>> >> >>>>>>>> >> >> > >> >
>> >> >>>>>>>> >> >> > >> > That said, if we can get in non-masked epilogue
>> >> >>>>>>vectorization
>> >> >>>>>>>> >> >> > >> > separately that would be great.
>> >> >>>>>>>> >> >> > >>
>> >> >>>>>>>> >> >> > >> Could you please clarify your proposal.
>> >> >>>>>>>> >> >> > >
>> >> >>>>>>>> >> >> > > When a loop was vectorized set things up to immediately
>> >> >>>>>>vectorize
>> >> >>>>>>>> >> >> > > its epilogue, avoiding changing the loop iteration and
>> >> >>>>>>avoiding
>> >> >>>>>>>> >> >> > > the re-use of ->aux.
>> >> >>>>>>>> >> >> > >
>> >> >>>>>>>> >> >> > > Richard.
>> >> >>>>>>>> >> >> > >
>> >> >>>>>>>> >> >> > >> Thanks.
>> >> >>>>>>>> >> >> > >> Yuri.
>> >> >>>>>>>> >> >> > >>
>> >> >>>>>>>> >> >> > >> 2016-11-02 15:27 GMT+03:00 Richard Biener
>> >> >>>>>><rguenther@suse.de>:
>> >> >>>>>>>> >> >> > >> > On Tue, 1 Nov 2016, Yuri Rumyantsev wrote:
>> >> >>>>>>>> >> >> > >> >
>> >> >>>>>>>> >> >> > >> >> Hi All,
>> >> >>>>>>>> >> >> > >> >>
>> >> >>>>>>>> >> >> > >> >> I re-send all patches sent by Ilya earlier for review
>> >> >>>>>>which support
>> >> >>>>>>>> >> >> > >> >> vectorization of loop epilogues and loops with low
>> >> >>>>>>trip count. We
>> >> >>>>>>>> >> >> > >> >> assume that the only patch -
>> >> >>>>>>vec-tails-07-combine-tail.patch - was not
>> >> >>>>>>>> >> >> > >> >> approved by Jeff.
>> >> >>>>>>>> >> >> > >> >>
>> >> >>>>>>>> >> >> > >> >> I did re-base of all patches and performed
>> >> >>>>>>bootstrapping and
>> >> >>>>>>>> >> >> > >> >> regression testing that did not show any new failures.
>> >> >>>>>>Also all
>> >> >>>>>>>> >> >> > >> >> changes related to new vect_do_peeling algorithm have
>> >> >>>>>>been changed
>> >> >>>>>>>> >> >> > >> >> accordingly.
>> >> >>>>>>>> >> >> > >> >>
>> >> >>>>>>>> >> >> > >> >> Is it OK for trunk?
>> >> >>>>>>>> >> >> > >> >
>> >> >>>>>>>> >> >> > >> > I would have prefered that the series up to
>> >> >>>>>>-03-nomask-tails would
>> >> >>>>>>>> >> >> > >> > _only_ contain epilogue loop vectorization changes but
>> >> >>>>>>unfortunately
>> >> >>>>>>>> >> >> > >> > the patchset is oddly separated.
>> >> >>>>>>>> >> >> > >> >
>> >> >>>>>>>> >> >> > >> > I have a comment on that part nevertheless:
>> >> >>>>>>>> >> >> > >> >
>> >> >>>>>>>> >> >> > >> > @@ -1608,7 +1614,10 @@ vect_enhance_data_refs_alignment
>> >> >>>>>>(loop_vec_info
>> >> >>>>>>>> >> >> > >> > loop_vinfo)
>> >> >>>>>>>> >> >> > >> >    /* Check if we can possibly peel the loop.  */
>> >> >>>>>>>> >> >> > >> >    if (!vect_can_advance_ivs_p (loop_vinfo)
>> >> >>>>>>>> >> >> > >> >        || !slpeel_can_duplicate_loop_p (loop,
>> >> >>>>>>single_exit (loop))
>> >> >>>>>>>> >> >> > >> > -      || loop->inner)
>> >> >>>>>>>> >> >> > >> > +      || loop->inner
>> >> >>>>>>>> >> >> > >> > +      /* Required peeling was performed in prologue
>> >> >>>>>>and
>> >> >>>>>>>> >> >> > >> > +        is not required for epilogue.  */
>> >> >>>>>>>> >> >> > >> > +      || LOOP_VINFO_EPILOGUE_P (loop_vinfo))
>> >> >>>>>>>> >> >> > >> >      do_peeling = false;
>> >> >>>>>>>> >> >> > >> >
>> >> >>>>>>>> >> >> > >> >    if (do_peeling
>> >> >>>>>>>> >> >> > >> > @@ -1888,7 +1897,10 @@ vect_enhance_data_refs_alignment
>> >> >>>>>>(loop_vec_info
>> >> >>>>>>>> >> >> > >> > loop_vinfo)
>> >> >>>>>>>> >> >> > >> >
>> >> >>>>>>>> >> >> > >> >    do_versioning =
>> >> >>>>>>>> >> >> > >> >         optimize_loop_nest_for_speed_p (loop)
>> >> >>>>>>>> >> >> > >> > -       && (!loop->inner); /* FORNOW */
>> >> >>>>>>>> >> >> > >> > +       && (!loop->inner) /* FORNOW */
>> >> >>>>>>>> >> >> > >> > +        /* Required versioning was performed for the
>> >> >>>>>>>> >> >> > >> > +          original loop and is not required for
>> >> >>>>>>epilogue.  */
>> >> >>>>>>>> >> >> > >> > +       && !LOOP_VINFO_EPILOGUE_P (loop_vinfo);
>> >> >>>>>>>> >> >> > >> >
>> >> >>>>>>>> >> >> > >> >    if (do_versioning)
>> >> >>>>>>>> >> >> > >> >      {
>> >> >>>>>>>> >> >> > >> >
>> >> >>>>>>>> >> >> > >> > please do that check in the single caller of this
>> >> >>>>>>function.
>> >> >>>>>>>> >> >> > >> >
>> >> >>>>>>>> >> >> > >> > Otherwise I still dislike the new ->aux use and I
>> >> >>>>>>believe that simply
>> >> >>>>>>>> >> >> > >> > passing down info from the processed parent would be
>> >> >>>>>>_much_ cleaner.
>> >> >>>>>>>> >> >> > >> > That is, here (and avoid the FOR_EACH_LOOP change):
>> >> >>>>>>>> >> >> > >> >
>> >> >>>>>>>> >> >> > >> > @@ -580,12 +586,21 @@ vectorize_loops (void)
>> >> >>>>>>>> >> >> > >> >             && dump_enabled_p ())
>> >> >>>>>>>> >> >> > >> >            dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
>> >> >>>>>>vect_location,
>> >> >>>>>>>> >> >> > >> >                             "loop vectorized\n");
>> >> >>>>>>>> >> >> > >> > -       vect_transform_loop (loop_vinfo);
>> >> >>>>>>>> >> >> > >> > +       new_loop = vect_transform_loop (loop_vinfo);
>> >> >>>>>>>> >> >> > >> >         num_vectorized_loops++;
>> >> >>>>>>>> >> >> > >> >         /* Now that the loop has been vectorized, allow
>> >> >>>>>>it to be unrolled
>> >> >>>>>>>> >> >> > >> >            etc.  */
>> >> >>>>>>>> >> >> > >> >         loop->force_vectorize = false;
>> >> >>>>>>>> >> >> > >> >
>> >> >>>>>>>> >> >> > >> > +       /* Add new loop to a processing queue.  To make
>> >> >>>>>>it easier
>> >> >>>>>>>> >> >> > >> > +          to match loop and its epilogue vectorization
>> >> >>>>>>in dumps
>> >> >>>>>>>> >> >> > >> > +          put new loop as the next loop to process.
>> >> >>>>>>*/
>> >> >>>>>>>> >> >> > >> > +       if (new_loop)
>> >> >>>>>>>> >> >> > >> > +         {
>> >> >>>>>>>> >> >> > >> > +           loops.safe_insert (i + 1, new_loop->num);
>> >> >>>>>>>> >> >> > >> > +           vect_loops_num = number_of_loops (cfun);
>> >> >>>>>>>> >> >> > >> > +         }
>> >> >>>>>>>> >> >> > >> >
>> >> >>>>>>>> >> >> > >> > simply dispatch to a vectorize_epilogue (loop_vinfo,
>> >> >>>>>>new_loop)
>> >> >>>>>>>> >> >> > >> > function which will set up stuff properly (and also
>> >> >>>>>>perform
>> >> >>>>>>>> >> >> > >> > the if-conversion of the epilogue there).
>> >> >>>>>>>> >> >> > >> >
>> >> >>>>>>>> >> >> > >> > That said, if we can get in non-masked epilogue
>> >> >>>>>>vectorization
>> >> >>>>>>>> >> >> > >> > separately that would be great.
>> >> >>>>>>>> >> >> > >> >
>> >> >>>>>>>> >> >> > >> > I'm still torn about all the rest of the stuff and
>> >> >>>>>>question its
>> >> >>>>>>>> >> >> > >> > usability (esp. merging the epilogue with the main
>> >> >>>>>>vector loop).
>> >> >>>>>>>> >> >> > >> > But it has already been approved ... oh well.
>> >> >>>>>>>> >> >> > >> >
>> >> >>>>>>>> >> >> > >> > Thanks,
>> >> >>>>>>>> >> >> > >> > Richard.
>> >> >>>>>>>> >> >> > >>
>> >> >>>>>>>> >> >> > >>
>> >> >>>>>>>> >> >> > >
>> >> >>>>>>>> >> >> > > --
>> >> >>>>>>>> >> >> > > Richard Biener <rguenther@suse.de>
>> >> >>>>>>>> >> >> > > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard,
>> >> >>>>>>Graham Norton, HRB 21284 (AG Nuernberg)
>> >> >>>>>>>> >> >> >
>> >> >>>>>>>> >> >>
>> >> >>>>>>>> >> >>
>> >> >>>>>>>> >> >
>> >> >>>>>>>> >> > --
>> >> >>>>>>>> >> > Richard Biener <rguenther@suse.de>
>> >> >>>>>>>> >> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham
>> >> >>>>>>Norton, HRB 21284 (AG Nuernberg)
>> >> >>>>>>>> >>
>> >> >>>>>>>> >
>> >> >>>>>>>> > --
>> >> >>>>>>>> > Richard Biener <rguenther@suse.de>
>> >> >>>>>>>> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham
>> >> >>>>>>Norton, HRB 21284 (AG Nuernberg)
>> >> >>>>>>>>
>> >> >>>>>>>
>> >> >>>>>>> --
>> >> >>>>>>> Richard Biener <rguenther@suse.de>
>> >> >>>>>>> SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham
>> >> >>>>>>Norton, HRB 21284 (AG Nuernberg)
>> >> >>>>>
>> >> >>>>>
>> >>
>> >
>> > --
>> > Richard Biener <rguenther@suse.de>
>> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)
>>
>
> --
> Richard Biener <rguenther@suse.de>
> SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)
Richard Biener Dec. 1, 2016, 2:45 p.m. UTC | #2
On Thu, 1 Dec 2016, Yuri Rumyantsev wrote:

> Thanks Richard for your comments.
> 
> You asked me about possible performance improvements for AVX2 machines
> - we did not see any visible speed-up for spec2k with any method of

Spec 2000?  Can you check with SPEC 2006 or CPUv6?

Did you see performance degradation?  What about compile-time and
binary size effects?

> masking, including epilogue masking and combining, only on AVX512
> machine aka knl.

I see.

Note that as said in the initial review patch the cost model I
saw therein looked flawed.  In the end I'd expect a sensible
approach would be to do

 if (n < scalar-most-profitable-niter)
   {
     no vectorization
   }
 else if (n < masking-more-profitable-than-not-masking-plus-epilogue)
   {
     do masked vectorization
   }
 else
   {
     do unmasked vectorization (with epilogue, eventually vectorized)
   }

where for short trip loops the else path would never be taken
(statically).

And yes, that means masking will only be useful for short-trip loops
which in the end means an overall performance benfit is unlikely
unless we have a lot of short-trip loops that are slow because of
the overhead of main unmasked loop plus epilogue.

Richard.

> I will answer on your question later.
> 
> Best regards.
> Yuri
> 
> 2016-12-01 14:33 GMT+03:00 Richard Biener <rguenther@suse.de>:
> > On Mon, 28 Nov 2016, Yuri Rumyantsev wrote:
> >
> >> Richard!
> >>
> >> I attached vect dump for hte part of attached test-case which
> >> illustrated how vectorization of epilogues works through masking:
> >> #define SIZE 1023
> >> #define ALIGN 64
> >>
> >> extern int posix_memalign(void **memptr, __SIZE_TYPE__ alignment,
> >> __SIZE_TYPE__ size) __attribute__((weak));
> >> extern void free (void *);
> >>
> >> void __attribute__((noinline))
> >> test_citer (int * __restrict__ a,
> >>    int * __restrict__ b,
> >>    int * __restrict__ c)
> >> {
> >>   int i;
> >>
> >>   a = (int *)__builtin_assume_aligned (a, ALIGN);
> >>   b = (int *)__builtin_assume_aligned (b, ALIGN);
> >>   c = (int *)__builtin_assume_aligned (c, ALIGN);
> >>
> >>   for (i = 0; i < SIZE; i++)
> >>     c[i] = a[i] + b[i];
> >> }
> >>
> >> It was compiled with -mavx2 --param vect-epilogues-mask=1 options.
> >>
> >> I did not include in this patch vectorization of low trip-count loops
> >> since in the original patch additional parameter was introduced:
> >> +DEFPARAM (PARAM_VECT_SHORT_LOOPS,
> >> +  "vect-short-loops",
> >> +  "Enable vectorization of low trip count loops using masking.",
> >> +  0, 0, 1)
> >>
> >> I assume that this ability can be included very quickly but it
> >> requires cost model enhancements also.
> >
> > Comments on the patch itself (as I'm having a closer look again,
> > I know how it vectorizes the above but I wondered why epilogue
> > and short-trip loops are not basically the same code path).
> >
> > Btw, I don't like that the features are behind a --param paywall.
> > That just means a) nobody will use it, b) it will bit-rot quickly,
> > c) bugs are well-hidden.
> >
> > +  if (loop_vinfo && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
> > +      && integer_zerop (nested_in_vect_loop
> > +                       ? STMT_VINFO_DR_STEP (stmt_info)
> > +                       : DR_STEP (dr)))
> > +    {
> > +      if (dump_enabled_p ())
> > +       dump_printf_loc (MSG_NOTE, vect_location,
> > +                        "allow invariant load for masked loop.\n");
> > +    }
> >
> > this can test memory_access_type == VMAT_INVARIANT.  Please put
> > all the checks in a common
> >
> >   if (loop_vinfo && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
> >     {
> >        if (memory_access_type == VMAT_INVARIANT)
> >          {
> >          }
> >        else if (...)
> >          {
> >             LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = false;
> >          }
> >        else if (..)
> > ...
> >     }
> >
> > @@ -6667,6 +6756,15 @@ vectorizable_load (gimple *stmt,
> > gimple_stmt_iterator *gsi, gimple **vec_stmt,
> >        gcc_assert (!nested_in_vect_loop);
> >        gcc_assert (!STMT_VINFO_GATHER_SCATTER_P (stmt_info));
> >
> > +      if (loop_vinfo && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
> > +       {
> > +         if (dump_enabled_p ())
> > +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > +                            "cannot be masked: grouped access is not"
> > +                            " supported.");
> > +         LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = false;
> > +      }
> > +
> >
> > isn't this already handled by the above?  Or rather the general
> > disallowance of SLP?
> >
> > @@ -5730,6 +5792,24 @@ vectorizable_store (gimple *stmt,
> > gimple_stmt_iterator *gsi, gimple **vec_stmt,
> >                             &memory_access_type, &gs_info))
> >      return false;
> >
> > +  if (loop_vinfo && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
> > +      && memory_access_type != VMAT_CONTIGUOUS)
> > +    {
> > +      LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = false;
> > +      if (dump_enabled_p ())
> > +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > +                        "cannot be masked: unsupported memory access
> > type.\n");
> > +    }
> > +
> > +  if (loop_vinfo && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
> > +      && !can_mask_load_store (stmt))
> > +    {
> > +      LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = false;
> > +      if (dump_enabled_p ())
> > +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > +                        "cannot be masked: unsupported masked store.\n");
> > +    }
> > +
> >
> > likewise please combine the ifs.
> >
> > @@ -2354,7 +2401,10 @@ vectorizable_mask_load_store (gimple *stmt,
> > gimple_stmt_iterator *gsi,
> >                                           ptr, vec_mask, vec_rhs);
> >           vect_finish_stmt_generation (stmt, new_stmt, gsi);
> >           if (i == 0)
> > -           STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
> > +           {
> > +             STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
> > +             STMT_VINFO_FIRST_COPY_P (vinfo_for_stmt (new_stmt)) = true;
> > +           }
> >           else
> >             STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
> >           prev_stmt_info = vinfo_for_stmt (new_stmt);
> >
> > here you only set the flag, elsewhere you copy DR and VECTYPE as well.
> >
> > @@ -2113,6 +2146,20 @@ vectorizable_mask_load_store (gimple *stmt,
> > gimple_stmt_iterator *gsi,
> >                && !useless_type_conversion_p (vectype, rhs_vectype)))
> >      return false;
> >
> > +  if (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
> > +    {
> > +      /* Check that mask conjuction is supported.  */
> > +      optab tab;
> > +      tab = optab_for_tree_code (BIT_AND_EXPR, vectype, optab_default);
> > +      if (!tab || optab_handler (tab, TYPE_MODE (vectype)) ==
> > CODE_FOR_nothing)
> > +       {
> > +         if (dump_enabled_p ())
> > +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > +                            "cannot be masked: unsupported mask
> > operation\n");
> > +         LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = false;
> > +       }
> > +    }
> >
> > does this really test whether we can bit-and the mask?  You are
> > using the vector type of the store (which might be V2DF for example),
> > also for AVX512 it might be a vector-bool type with integer mode?
> > Of course we maybe can simply assume mask conjunction is available
> > (I know no ISA where that would be not true).
> >
> > +/* Return true if STMT can be converted to masked form.  */
> > +
> > +static bool
> > +can_mask_load_store (gimple *stmt)
> > +{
> > +  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
> > +  tree vectype, mask_vectype;
> > +  tree lhs, ref;
> > +
> > +  if (!stmt_info)
> > +    return false;
> > +  lhs = gimple_assign_lhs (stmt);
> > +  ref = (TREE_CODE (lhs) == SSA_NAME) ? gimple_assign_rhs1 (stmt) : lhs;
> > +  if (may_be_nonaddressable_p (ref))
> > +    return false;
> > +  vectype = STMT_VINFO_VECTYPE (stmt_info);
> >
> > You probably modeled this after ifcvt_can_use_mask_load_store but I
> > don't think checking may_be_nonaddressable_p is necessary (we couldn't
> > even vectorize such refs).  stmt_info should never be NULL either.
> > With the check removed tree-ssa-loop-ivopts.h should no longer be
> > necessary.
> >
> > +static void
> > +vect_mask_load_store_stmt (gimple *stmt, tree vectype, tree mask,
> > +                          data_reference *dr, gimple_stmt_iterator *si)
> > +{
> > ...
> > +  addr = force_gimple_operand_gsi (&gsi, build_fold_addr_expr (mem),
> > +                                  true, NULL_TREE, true,
> > +                                  GSI_SAME_STMT);
> > +
> > +  align = TYPE_ALIGN_UNIT (vectype);
> > +  if (aligned_access_p (dr))
> > +    misalign = 0;
> > +  else if (DR_MISALIGNMENT (dr) == -1)
> > +    {
> > +      align = TYPE_ALIGN_UNIT (elem_type);
> > +      misalign = 0;
> > +    }
> > +  else
> > +    misalign = DR_MISALIGNMENT (dr);
> > +  set_ptr_info_alignment (get_ptr_info (addr), align, misalign);
> > +  ptr = build_int_cst (reference_alias_ptr_type (mem),
> > +                      misalign ? misalign & -misalign : align);
> >
> > you should simply use
> >
> >   align = get_object_alignment (mem) / BITS_PER_UNIT;
> >
> > here rather than trying to be clever.  Eventually you don't need
> > the DR then (see question above).
> >
> > +    }
> > +  gsi_replace (si ? si : &gsi, new_stmt, false);
> >
> > when you replace the load/store please previously copy VUSE and VDEF
> > from the original one (we were nearly clean enough to no longer
> > require a virtual operand rewrite after vectorization...)  Thus
> >
> >   gimple_set_vuse (new_stmt, gimple_vuse (stmt));
> >   gimple_set_vdef (new_stmt, gimple_vdef (stmt));
> >
> > +static void
> > +vect_mask_loop (loop_vec_info loop_vinfo)
> > +{
> > ...
> > +  /* Scan all loop statements to convert vector load/store including
> > masked
> > +     form.  */
> > +  for (unsigned i = 0; i < loop->num_nodes; i++)
> > +    {
> > +      basic_block bb = bbs[i];
> > +      for (gimple_stmt_iterator si = gsi_start_bb (bb);
> > +          !gsi_end_p (si); gsi_next (&si))
> > +       {
> > +         gimple *stmt = gsi_stmt (si);
> > +         stmt_vec_info stmt_info = NULL;
> > +         tree vectype = NULL;
> > +         data_reference *dr;
> > +
> > +         /* Mask load case.  */
> > +         if (is_gimple_call (stmt)
> > +             && gimple_call_internal_p (stmt)
> > +             && gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
> > +             && !VECTOR_TYPE_P (TREE_TYPE (gimple_call_arg (stmt, 2))))
> > +           {
> > ...
> > +             /* Skip invariant loads.  */
> > +             if (integer_zerop (nested_in_vect_loop_p (loop, stmt)
> > +                                ? STMT_VINFO_DR_STEP (stmt_info)
> > +                                : DR_STEP (STMT_VINFO_DATA_REF
> > (stmt_info))))
> > +               continue;
> >
> > seeing this it would be nice if stmt_info had a flag for whether
> > the stmt needs masking (and a flag on wheter this is a scalar or a
> > vectorized stmt).
> >
> > +         /* Skip hoisted out statements.  */
> > +         if (!flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
> > +           continue;
> >
> > err, you walk stmts in the loop!  Isn't this covered by the above
> > skipping of 'invariant loads'?
> >
> > +static gimple *
> > +vect_mask_reduction_stmt (gimple *stmt, tree mask, gimple *prev)
> > +{
> >
> > depending on the reduction operand there are variants that
> > could get away w/o the VEC_COND_EXPR, like
> >
> >   S1': tem_4 = d_3 & MASK;
> >   S2': r_1 = r_2 + tem_4;
> >
> > which works for plus at least.  More generally doing
> >
> >   S1': tem_4 = VEC_COND_EXPR<MASK, d_3, neutral operand>
> >   S2': r_1 = r_2 OP tem_4;
> >
> > and leaving optimization to & to later opts (& won't work for
> > AVX512 mask registers I guess).
> >
> > Good enough for later enhacement of course.
> >
> > +static void
> > +vect_gen_ivs_for_masking (loop_vec_info loop_vinfo, vec<tree> *ivs)
> > +{
> > ...
> >
> > isn't it enough to always create a single IV and derive the
> > additional copies by IV + i * { elems, elems, elems ... }?
> > IVs are expensive -- I'm sure we can optimize the rest of the
> > scheme further as well but this one looks obvious to me.
> >
> > @@ -3225,12 +3508,32 @@ vect_estimate_min_profitable_iters (loop_vec_info
> > loop_vinfo,
> >    int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
> >    void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
> >
> > +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
> > +    {
> > +      /* Currently we don't produce scalar epilogue version in case
> > +        its masked version is provided.  It means we don't need to
> > +        compute profitability one more time here.  Just make a
> > +        masked loop version.  */
> > +      if (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
> > +         && PARAM_VALUE (PARAM_VECT_EPILOGUES_MASK))
> > +       {
> > +         dump_printf_loc (MSG_NOTE, vect_location,
> > +                          "cost model: mask loop epilogue.\n");
> > +         LOOP_VINFO_MASK_LOOP (loop_vinfo) = true;
> > +         *ret_min_profitable_niters = 0;
> > +         *ret_min_profitable_estimate = 0;
> > +         return;
> > +       }
> > +    }
> >    /* Cost model disabled.  */
> > -  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
> > +  else if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
> >      {
> >        dump_printf_loc (MSG_NOTE, vect_location, "cost model
> > disabled.\n");
> >        *ret_min_profitable_niters = 0;
> >        *ret_min_profitable_estimate = 0;
> > +      if (PARAM_VALUE (PARAM_VECT_EPILOGUES_MASK)
> > +         && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
> > +       LOOP_VINFO_MASK_LOOP (loop_vinfo) = true;
> >        return;
> >      }
> >
> > the unlimited_cost_model case should come first?  OTOH masking or
> > not is probably not sth covered by 'unlimited' - that is about
> > vectorizing or not.  But the above code means that for
> > epilogue vectorization w/o masking we ignore unlimited_cost_model ()?
> > That doesn't make sense to me.
> >
> > Plus if this is short-trip or epilogue vectorization and the
> > cost model is _not_ unlimited then we dont' want to enable
> > masking always (if it is possible).  It might be we statically
> > know the epilogue executes for at most two iterations for example.
> >
> > I don't see _any_ cost model for vectorizing the epilogue with
> > masking?  Am I missing something?  A "trivial" cost model
> > should at least consider the additional IV(s), the mask
> > compute and the widening and narrowing ops required.
> >
> > diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
> > index e13d6a2..36be342 100644
> > --- a/gcc/tree-vect-loop-manip.c
> > +++ b/gcc/tree-vect-loop-manip.c
> > @@ -1635,6 +1635,13 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree
> > niters, tree nitersm1,
> >    bool epilog_peeling = (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
> >                          || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
> >
> > +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
> > +    {
> > +      prolog_peeling = false;
> > +      if (LOOP_VINFO_MASK_LOOP (loop_vinfo))
> > +       epilog_peeling = false;
> > +    }
> > +
> >    if (!prolog_peeling && !epilog_peeling)
> >      return NULL;
> >
> > I think the prolog_peeling was fixed during the epilogue vectorization
> > review and should no longer be necessary.  Please add
> > a && ! LOOP_VINFO_MASK_LOOP () to the epilog_peeling init instead
> > (it should also work for short-trip loop vectorization).
> >
> > @@ -2022,11 +2291,18 @@ start_over:
> >        || (max_niter != -1
> >           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
> >      {
> > -      if (dump_enabled_p ())
> > -       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > -                        "not vectorized: iteration count smaller than "
> > -                        "vectorization factor.\n");
> > -      return false;
> > +      /* Allow low trip count for loop epilogue we want to mask.  */
> > +      if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
> > +         && PARAM_VALUE (PARAM_VECT_EPILOGUES_MASK))
> > +       LOOP_VINFO_MASK_LOOP (loop_vinfo) = true;
> > +      else
> > +       {
> > +         if (dump_enabled_p ())
> >
> > so why do we test only LOOP_VINFO_EPILOGUE_P here?  All the code
> > I saw sofar would also work for the main loop (but the cost
> > model is missing).
> >
> > I am missing testcases.  There's only a single one but we should
> > have cases covering all kinds of mask IV widths and widen/shorten
> > masks.
> >
> > Do you have any numbers on SPEC 2k6 with epilogue vect and/or masking
> > enabled for an AVX2 machine?
> >
> > Oh, and I really dislike the --param paywall.
> >
> > Thanks,
> > Richard.
> >
> >> Best regards.
> >> Yuri.
> >>
> >>
> >> 2016-11-28 17:39 GMT+03:00 Richard Biener <rguenther@suse.de>:
> >> > On Thu, 24 Nov 2016, Yuri Rumyantsev wrote:
> >> >
> >> >> Hi All,
> >> >>
> >> >> Here is the second patch which supports epilogue vectorization using
> >> >> masking without cost model. Currently it is possible
> >> >> only with passing parameter "--param vect-epilogues-mask=1".
> >> >>
> >> >> Bootstrapping and regression testing did not show any new regression.
> >> >>
> >> >> Any comments will be appreciated.
> >> >
> >> > Going over the patch the main question is one how it works -- it looks
> >> > like the decision whether to vectorize & mask the epilogue is made
> >> > when vectorizing the loop that generates the epilogue rather than
> >> > in the epilogue vectorization path?
> >> >
> >> > That is, I'd have expected to see this handling low-trip count loops
> >> > by masking?  And thus masking the epilogue simply by it being
> >> > low-trip count?
> >> >
> >> > Richard.
> >> >
> >> >> ChangeLog:
> >> >> 2016-11-24  Yuri Rumyantsev  <ysrumyan@gmail.com>
> >> >>
> >> >> * params.def (PARAM_VECT_EPILOGUES_MASK): New.
> >> >> * tree-vect-data-refs.c (vect_get_new_ssa_name): Support vect_mask_var.
> >> >> * tree-vect-loop.c: Include insn-config.h, recog.h and alias.h.
> >> >> (new_loop_vec_info): Add zeroing can_be_masked, mask_loop and
> >> >> required_mask fields.
> >> >> (vect_check_required_masks_widening): New.
> >> >> (vect_check_required_masks_narrowing): New.
> >> >> (vect_get_masking_iv_elems): New.
> >> >> (vect_get_masking_iv_type): New.
> >> >> (vect_get_extreme_masks): New.
> >> >> (vect_check_required_masks): New.
> >> >> (vect_analyze_loop_operations): Call vect_check_required_masks if all
> >> >> statements can be masked.
> >> >> (vect_analyze_loop_2): Inititalize to zero min_scalar_loop_bound.
> >> >> Add check that epilogue can be masked with the same vf with issue
> >> >> fail notes.  Allow epilogue vectorization through masking of low trip
> >> >> loops. Set to true can_be_masked field before loop operation analysis.
> >> >> Do not set-up min_scalar_loop_bound for epilogue vectorization through
> >> >> masking.  Do not peeling for epilogue masking.  Reset can_be_masked
> >> >> field before repeat analysis.
> >> >> (vect_estimate_min_profitable_iters): Do not compute profitability
> >> >> for epilogue masking.  Set up mask_loop filed to true if parameter
> >> >> PARAM_VECT_EPILOGUES_MASK is non-zero.
> >> >> (vectorizable_reduction): Add check that statement can be masked.
> >> >> (vectorizable_induction): Do not support masking for induction.
> >> >> (vect_gen_ivs_for_masking): New.
> >> >> (vect_get_mask_index_for_elems): New.
> >> >> (vect_get_mask_index_for_type): New.
> >> >> (vect_create_narrowed_masks): New.
> >> >> (vect_create_widened_masks): New.
> >> >> (vect_gen_loop_masks): New.
> >> >> (vect_mask_reduction_stmt): New.
> >> >> (vect_mask_mask_load_store_stmt): New.
> >> >> (vect_mask_load_store_stmt): New.
> >> >> (vect_mask_loop): New.
> >> >> (vect_transform_loop): Invoke vect_mask_loop if required.
> >> >> Use div_ceil to recompute upper bounds for masked loops.  Issue
> >> >> statistics for epilogue vectorization through masking. Do not reduce
> >> >> vf for masking epilogue.
> >> >> * tree-vect-stmts.c: Include tree-ssa-loop-ivopts.h.
> >> >> (can_mask_load_store): New.
> >> >> (vectorizable_mask_load_store): Check that mask conjuction is
> >> >> supported.  Set-up first_copy_p field of stmt_vinfo.
> >> >> (vectorizable_simd_clone_call): Check that simd clone can not be
> >> >> masked.
> >> >> (vectorizable_store): Check that store can be masked. Mark the first
> >> >> copy of generated vector stores and provide it with vectype and the
> >> >> original data reference.
> >> >> (vectorizable_load): Check that load can be masked.
> >> >> (vect_stmt_should_be_masked_for_epilogue): New.
> >> >> (vect_add_required_mask_for_stmt): New.
> >> >> (vect_analyze_stmt): Add check on unsupported statements for masking
> >> >> with printing message.
> >> >> * tree-vectorizer.h (struct _loop_vec_info): Add new fields
> >> >> can_be_maske, required_masks, masl_loop.
> >> >> (LOOP_VINFO_CAN_BE_MASKED): New.
> >> >> (LOOP_VINFO_REQUIRED_MASKS): New.
> >> >> (LOOP_VINFO_MASK_LOOP): New.
> >> >> (struct _stmt_vec_info): Add first_copy_p field.
> >> >> (STMT_VINFO_FIRST_COPY_P): New.
> >> >>
> >> >> gcc/testsuite/
> >> >>
> >> >> * gcc.dg/vect/vect-tail-mask-1.c: New test.
> >> >>
> >> >> 2016-11-18 18:54 GMT+03:00 Christophe Lyon <christophe.lyon@linaro.org>:
> >> >> > On 18 November 2016 at 16:46, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
> >> >> >> It is very strange that this test failed on arm, since it requires
> >> >> >> target avx2 to check vectorizer dumps:
> >> >> >>
> >> >> >> /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" {
> >> >> >> target avx2_runtime } } } */
> >> >> >> /* { dg-final { scan-tree-dump-times "LOOP EPILOGUE VECTORIZED
> >> >> >> \\(VS=16\\)" 2 "vect" { target avx2_runtime } } } */
> >> >> >>
> >> >> >> Could you please clarify what is the reason of the failure?
> >> >> >
> >> >> > It's not the scan-dumps that fail, but the execution.
> >> >> > The test calls abort() for some reason.
> >> >> >
> >> >> > It will take me a while to rebuild the test manually in the right
> >> >> > debug environment to provide you with more traces.
> >> >> >
> >> >> >
> >> >> >
> >> >> >>
> >> >> >> Thanks.
> >> >> >>
> >> >> >> 2016-11-18 16:20 GMT+03:00 Christophe Lyon <christophe.lyon@linaro.org>:
> >> >> >>> On 15 November 2016 at 15:41, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
> >> >> >>>> Hi All,
> >> >> >>>>
> >> >> >>>> Here is patch for non-masked epilogue vectoriziation.
> >> >> >>>>
> >> >> >>>> Bootstrap and regression testing did not show any new failures.
> >> >> >>>>
> >> >> >>>> Is it OK for trunk?
> >> >> >>>>
> >> >> >>>> Thanks.
> >> >> >>>> Changelog:
> >> >> >>>>
> >> >> >>>> 2016-11-15  Yuri Rumyantsev  <ysrumyan@gmail.com>
> >> >> >>>>
> >> >> >>>> * params.def (PARAM_VECT_EPILOGUES_NOMASK): New.
> >> >> >>>> * tree-if-conv.c (tree_if_conversion): Make public.
> >> >> >>>> * * tree-if-conv.h: New file.
> >> >> >>>> * tree-vect-data-refs.c (vect_analyze_data_ref_dependences) Avoid
> >> >> >>>> dynamic alias checks for epilogues.
> >> >> >>>> * tree-vect-loop-manip.c (vect_do_peeling): Return created epilog.
> >> >> >>>> * tree-vect-loop.c: include tree-if-conv.h.
> >> >> >>>> (new_loop_vec_info): Add zeroing orig_loop_info field.
> >> >> >>>> (vect_analyze_loop_2): Don't try to enhance alignment for epilogues.
> >> >> >>>> (vect_analyze_loop): Add argument ORIG_LOOP_INFO which is not NULL
> >> >> >>>> if epilogue is vectorized, set up orig_loop_info field of loop_vinfo
> >> >> >>>> using passed argument.
> >> >> >>>> (vect_transform_loop): Check if created epilogue should be returned
> >> >> >>>> for further vectorization with less vf.  If-convert epilogue if
> >> >> >>>> required. Print vectorization success for epilogue.
> >> >> >>>> * tree-vectorizer.c (vectorize_loops): Add epilogue vectorization
> >> >> >>>> if it is required, pass loop_vinfo produced during vectorization of
> >> >> >>>> loop body to vect_analyze_loop.
> >> >> >>>> * tree-vectorizer.h (struct _loop_vec_info): Add new field
> >> >> >>>> orig_loop_info.
> >> >> >>>> (LOOP_VINFO_ORIG_LOOP_INFO): New.
> >> >> >>>> (LOOP_VINFO_EPILOGUE_P): New.
> >> >> >>>> (LOOP_VINFO_ORIG_VECT_FACTOR): New.
> >> >> >>>> (vect_do_peeling): Change prototype to return epilogue.
> >> >> >>>> (vect_analyze_loop): Add argument of loop_vec_info type.
> >> >> >>>> (vect_transform_loop): Return created loop.
> >> >> >>>>
> >> >> >>>> gcc/testsuite/
> >> >> >>>>
> >> >> >>>> * lib/target-supports.exp (check_avx2_hw_available): New.
> >> >> >>>> (check_effective_target_avx2_runtime): New.
> >> >> >>>> * gcc.dg/vect/vect-tail-nomask-1.c: New test.
> >> >> >>>>
> >> >> >>>
> >> >> >>> Hi,
> >> >> >>>
> >> >> >>> This new test fails on arm-none-eabi (using default cpu/fpu/mode):
> >> >> >>>   gcc.dg/vect/vect-tail-nomask-1.c -flto -ffat-lto-objects execution test
> >> >> >>>   gcc.dg/vect/vect-tail-nomask-1.c execution test
> >> >> >>>
> >> >> >>> It does pass on the same target if configured --with-cpu=cortex-a9.
> >> >> >>>
> >> >> >>> Christophe
> >> >> >>>
> >> >> >>>
> >> >> >>>
> >> >> >>>>
> >> >> >>>> 2016-11-14 20:04 GMT+03:00 Richard Biener <rguenther@suse.de>:
> >> >> >>>>> On November 14, 2016 4:39:40 PM GMT+01:00, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
> >> >> >>>>>>Richard,
> >> >> >>>>>>
> >> >> >>>>>>I checked one of the tests designed for epilogue vectorization using
> >> >> >>>>>>patches 1 - 3 and found out that build compiler performs vectorization
> >> >> >>>>>>of epilogues with --param vect-epilogues-nomask=1 passed:
> >> >> >>>>>>
> >> >> >>>>>>$ gcc -Ofast -mavx2 t1.c -S --param vect-epilogues-nomask=1 -o
> >> >> >>>>>>t1.new-nomask.s -fdump-tree-vect-details
> >> >> >>>>>>$ grep VECTORIZED -c t1.c.156t.vect
> >> >> >>>>>>4
> >> >> >>>>>> Without param only 2 loops are vectorized.
> >> >> >>>>>>
> >> >> >>>>>>Should I simply add a part of tests related to this feature or I must
> >> >> >>>>>>delete all not necessary changes also?
> >> >> >>>>>
> >> >> >>>>> Please remove all not necessary changes.
> >> >> >>>>>
> >> >> >>>>> Richard.
> >> >> >>>>>
> >> >> >>>>>>Thanks.
> >> >> >>>>>>Yuri.
> >> >> >>>>>>
> >> >> >>>>>>2016-11-14 16:40 GMT+03:00 Richard Biener <rguenther@suse.de>:
> >> >> >>>>>>> On Mon, 14 Nov 2016, Yuri Rumyantsev wrote:
> >> >> >>>>>>>
> >> >> >>>>>>>> Richard,
> >> >> >>>>>>>>
> >> >> >>>>>>>> In my previous patch I forgot to remove couple lines related to aux
> >> >> >>>>>>field.
> >> >> >>>>>>>> Here is the correct updated patch.
> >> >> >>>>>>>
> >> >> >>>>>>> Yeah, I noticed.  This patch would be ok for trunk (together with
> >> >> >>>>>>> necessary parts from 1 and 2) if all not required parts are removed
> >> >> >>>>>>> (and you'd add the testcases covering non-masked tail vect).
> >> >> >>>>>>>
> >> >> >>>>>>> Thus, can you please produce a single complete patch containing only
> >> >> >>>>>>> non-masked epilogue vectoriziation?
> >> >> >>>>>>>
> >> >> >>>>>>> Thanks,
> >> >> >>>>>>> Richard.
> >> >> >>>>>>>
> >> >> >>>>>>>> Thanks.
> >> >> >>>>>>>> Yuri.
> >> >> >>>>>>>>
> >> >> >>>>>>>> 2016-11-14 15:51 GMT+03:00 Richard Biener <rguenther@suse.de>:
> >> >> >>>>>>>> > On Fri, 11 Nov 2016, Yuri Rumyantsev wrote:
> >> >> >>>>>>>> >
> >> >> >>>>>>>> >> Richard,
> >> >> >>>>>>>> >>
> >> >> >>>>>>>> >> I prepare updated 3 patch with passing additional argument to
> >> >> >>>>>>>> >> vect_analyze_loop as you proposed (untested).
> >> >> >>>>>>>> >>
> >> >> >>>>>>>> >> You wrote:
> >> >> >>>>>>>> >> tw, I wonder if you can produce a single patch containing just
> >> >> >>>>>>>> >> epilogue vectorization, that is combine patches 1-3 but rip out
> >> >> >>>>>>>> >> changes only needed by later patches?
> >> >> >>>>>>>> >>
> >> >> >>>>>>>> >> Did you mean that I exclude all support for vectorization
> >> >> >>>>>>epilogues,
> >> >> >>>>>>>> >> i.e. exclude from 2-nd patch all non-related changes
> >> >> >>>>>>>> >> like
> >> >> >>>>>>>> >>
> >> >> >>>>>>>> >> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> >> >> >>>>>>>> >> index 11863af..32011c1 100644
> >> >> >>>>>>>> >> --- a/gcc/tree-vect-loop.c
> >> >> >>>>>>>> >> +++ b/gcc/tree-vect-loop.c
> >> >> >>>>>>>> >> @@ -1120,6 +1120,12 @@ new_loop_vec_info (struct loop *loop)
> >> >> >>>>>>>> >>    LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
> >> >> >>>>>>>> >>    LOOP_VINFO_PEELING_FOR_NITER (res) = false;
> >> >> >>>>>>>> >>    LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
> >> >> >>>>>>>> >> +  LOOP_VINFO_CAN_BE_MASKED (res) = false;
> >> >> >>>>>>>> >> +  LOOP_VINFO_REQUIRED_MASKS (res) = 0;
> >> >> >>>>>>>> >> +  LOOP_VINFO_COMBINE_EPILOGUE (res) = false;
> >> >> >>>>>>>> >> +  LOOP_VINFO_MASK_EPILOGUE (res) = false;
> >> >> >>>>>>>> >> +  LOOP_VINFO_NEED_MASKING (res) = false;
> >> >> >>>>>>>> >> +  LOOP_VINFO_ORIG_LOOP_INFO (res) = NULL;
> >> >> >>>>>>>> >
> >> >> >>>>>>>> > Yes.
> >> >> >>>>>>>> >
> >> >> >>>>>>>> >> Did you mean also that new combined patch must be working patch,
> >> >> >>>>>>i.e.
> >> >> >>>>>>>> >> can be integrated without other patches?
> >> >> >>>>>>>> >
> >> >> >>>>>>>> > Yes.
> >> >> >>>>>>>> >
> >> >> >>>>>>>> >> Could you please look at updated patch?
> >> >> >>>>>>>> >
> >> >> >>>>>>>> > Will do.
> >> >> >>>>>>>> >
> >> >> >>>>>>>> > Thanks,
> >> >> >>>>>>>> > Richard.
> >> >> >>>>>>>> >
> >> >> >>>>>>>> >> Thanks.
> >> >> >>>>>>>> >> Yuri.
> >> >> >>>>>>>> >>
> >> >> >>>>>>>> >> 2016-11-10 15:36 GMT+03:00 Richard Biener <rguenther@suse.de>:
> >> >> >>>>>>>> >> > On Thu, 10 Nov 2016, Richard Biener wrote:
> >> >> >>>>>>>> >> >
> >> >> >>>>>>>> >> >> On Tue, 8 Nov 2016, Yuri Rumyantsev wrote:
> >> >> >>>>>>>> >> >>
> >> >> >>>>>>>> >> >> > Richard,
> >> >> >>>>>>>> >> >> >
> >> >> >>>>>>>> >> >> > Here is updated 3 patch.
> >> >> >>>>>>>> >> >> >
> >> >> >>>>>>>> >> >> > I checked that all new tests related to epilogue
> >> >> >>>>>>vectorization passed with it.
> >> >> >>>>>>>> >> >> >
> >> >> >>>>>>>> >> >> > Your comments will be appreciated.
> >> >> >>>>>>>> >> >>
> >> >> >>>>>>>> >> >> A lot better now.  Instead of the ->aux dance I now prefer to
> >> >> >>>>>>>> >> >> pass the original loops loop_vinfo to vect_analyze_loop as
> >> >> >>>>>>>> >> >> optional argument (if non-NULL we analyze the epilogue of that
> >> >> >>>>>>>> >> >> loop_vinfo).  OTOH I remember we mainly use it to get at the
> >> >> >>>>>>>> >> >> original vectorization factor?  So we can pass down an
> >> >> >>>>>>(optional)
> >> >> >>>>>>>> >> >> forced vectorization factor as well?
> >> >> >>>>>>>> >> >
> >> >> >>>>>>>> >> > Btw, I wonder if you can produce a single patch containing just
> >> >> >>>>>>>> >> > epilogue vectorization, that is combine patches 1-3 but rip out
> >> >> >>>>>>>> >> > changes only needed by later patches?
> >> >> >>>>>>>> >> >
> >> >> >>>>>>>> >> > Thanks,
> >> >> >>>>>>>> >> > Richard.
> >> >> >>>>>>>> >> >
> >> >> >>>>>>>> >> >> Richard.
> >> >> >>>>>>>> >> >>
> >> >> >>>>>>>> >> >> > 2016-11-08 15:38 GMT+03:00 Richard Biener
> >> >> >>>>>><rguenther@suse.de>:
> >> >> >>>>>>>> >> >> > > On Thu, 3 Nov 2016, Yuri Rumyantsev wrote:
> >> >> >>>>>>>> >> >> > >
> >> >> >>>>>>>> >> >> > >> Hi Richard,
> >> >> >>>>>>>> >> >> > >>
> >> >> >>>>>>>> >> >> > >> I did not understand your last remark:
> >> >> >>>>>>>> >> >> > >>
> >> >> >>>>>>>> >> >> > >> > That is, here (and avoid the FOR_EACH_LOOP change):
> >> >> >>>>>>>> >> >> > >> >
> >> >> >>>>>>>> >> >> > >> > @@ -580,12 +586,21 @@ vectorize_loops (void)
> >> >> >>>>>>>> >> >> > >> >           && dump_enabled_p ())
> >> >> >>>>>>>> >> >> > >> >           dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
> >> >> >>>>>>vect_location,
> >> >> >>>>>>>> >> >> > >> >                            "loop vectorized\n");
> >> >> >>>>>>>> >> >> > >> > -       vect_transform_loop (loop_vinfo);
> >> >> >>>>>>>> >> >> > >> > +       new_loop = vect_transform_loop (loop_vinfo);
> >> >> >>>>>>>> >> >> > >> >         num_vectorized_loops++;
> >> >> >>>>>>>> >> >> > >> >        /* Now that the loop has been vectorized, allow
> >> >> >>>>>>it to be unrolled
> >> >> >>>>>>>> >> >> > >> >           etc.  */
> >> >> >>>>>>>> >> >> > >> >      loop->force_vectorize = false;
> >> >> >>>>>>>> >> >> > >> >
> >> >> >>>>>>>> >> >> > >> > +       /* Add new loop to a processing queue.  To make
> >> >> >>>>>>it easier
> >> >> >>>>>>>> >> >> > >> > +          to match loop and its epilogue vectorization
> >> >> >>>>>>in dumps
> >> >> >>>>>>>> >> >> > >> > +          put new loop as the next loop to process.
> >> >> >>>>>>*/
> >> >> >>>>>>>> >> >> > >> > +       if (new_loop)
> >> >> >>>>>>>> >> >> > >> > +         {
> >> >> >>>>>>>> >> >> > >> > +           loops.safe_insert (i + 1, new_loop->num);
> >> >> >>>>>>>> >> >> > >> > +           vect_loops_num = number_of_loops (cfun);
> >> >> >>>>>>>> >> >> > >> > +         }
> >> >> >>>>>>>> >> >> > >> >
> >> >> >>>>>>>> >> >> > >> > simply dispatch to a vectorize_epilogue (loop_vinfo,
> >> >> >>>>>>new_loop)
> >> >> >>>>>>>> >> >> > >> f> unction which will set up stuff properly (and also
> >> >> >>>>>>perform
> >> >> >>>>>>>> >> >> > >> > the if-conversion of the epilogue there).
> >> >> >>>>>>>> >> >> > >> >
> >> >> >>>>>>>> >> >> > >> > That said, if we can get in non-masked epilogue
> >> >> >>>>>>vectorization
> >> >> >>>>>>>> >> >> > >> > separately that would be great.
> >> >> >>>>>>>> >> >> > >>
> >> >> >>>>>>>> >> >> > >> Could you please clarify your proposal.
> >> >> >>>>>>>> >> >> > >
> >> >> >>>>>>>> >> >> > > When a loop was vectorized set things up to immediately
> >> >> >>>>>>vectorize
> >> >> >>>>>>>> >> >> > > its epilogue, avoiding changing the loop iteration and
> >> >> >>>>>>avoiding
> >> >> >>>>>>>> >> >> > > the re-use of ->aux.
> >> >> >>>>>>>> >> >> > >
> >> >> >>>>>>>> >> >> > > Richard.
> >> >> >>>>>>>> >> >> > >
> >> >> >>>>>>>> >> >> > >> Thanks.
> >> >> >>>>>>>> >> >> > >> Yuri.
> >> >> >>>>>>>> >> >> > >>
> >> >> >>>>>>>> >> >> > >> 2016-11-02 15:27 GMT+03:00 Richard Biener
> >> >> >>>>>><rguenther@suse.de>:
> >> >> >>>>>>>> >> >> > >> > On Tue, 1 Nov 2016, Yuri Rumyantsev wrote:
> >> >> >>>>>>>> >> >> > >> >
> >> >> >>>>>>>> >> >> > >> >> Hi All,
> >> >> >>>>>>>> >> >> > >> >>
> >> >> >>>>>>>> >> >> > >> >> I re-send all patches sent by Ilya earlier for review
> >> >> >>>>>>which support
> >> >> >>>>>>>> >> >> > >> >> vectorization of loop epilogues and loops with low
> >> >> >>>>>>trip count. We
> >> >> >>>>>>>> >> >> > >> >> assume that the only patch -
> >> >> >>>>>>vec-tails-07-combine-tail.patch - was not
> >> >> >>>>>>>> >> >> > >> >> approved by Jeff.
> >> >> >>>>>>>> >> >> > >> >>
> >> >> >>>>>>>> >> >> > >> >> I did re-base of all patches and performed
> >> >> >>>>>>bootstrapping and
> >> >> >>>>>>>> >> >> > >> >> regression testing that did not show any new failures.
> >> >> >>>>>>Also all
> >> >> >>>>>>>> >> >> > >> >> changes related to new vect_do_peeling algorithm have
> >> >> >>>>>>been changed
> >> >> >>>>>>>> >> >> > >> >> accordingly.
> >> >> >>>>>>>> >> >> > >> >>
> >> >> >>>>>>>> >> >> > >> >> Is it OK for trunk?
> >> >> >>>>>>>> >> >> > >> >
> >> >> >>>>>>>> >> >> > >> > I would have prefered that the series up to
> >> >> >>>>>>-03-nomask-tails would
> >> >> >>>>>>>> >> >> > >> > _only_ contain epilogue loop vectorization changes but
> >> >> >>>>>>unfortunately
> >> >> >>>>>>>> >> >> > >> > the patchset is oddly separated.
> >> >> >>>>>>>> >> >> > >> >
> >> >> >>>>>>>> >> >> > >> > I have a comment on that part nevertheless:
> >> >> >>>>>>>> >> >> > >> >
> >> >> >>>>>>>> >> >> > >> > @@ -1608,7 +1614,10 @@ vect_enhance_data_refs_alignment
> >> >> >>>>>>(loop_vec_info
> >> >> >>>>>>>> >> >> > >> > loop_vinfo)
> >> >> >>>>>>>> >> >> > >> >    /* Check if we can possibly peel the loop.  */
> >> >> >>>>>>>> >> >> > >> >    if (!vect_can_advance_ivs_p (loop_vinfo)
> >> >> >>>>>>>> >> >> > >> >        || !slpeel_can_duplicate_loop_p (loop,
> >> >> >>>>>>single_exit (loop))
> >> >> >>>>>>>> >> >> > >> > -      || loop->inner)
> >> >> >>>>>>>> >> >> > >> > +      || loop->inner
> >> >> >>>>>>>> >> >> > >> > +      /* Required peeling was performed in prologue
> >> >> >>>>>>and
> >> >> >>>>>>>> >> >> > >> > +        is not required for epilogue.  */
> >> >> >>>>>>>> >> >> > >> > +      || LOOP_VINFO_EPILOGUE_P (loop_vinfo))
> >> >> >>>>>>>> >> >> > >> >      do_peeling = false;
> >> >> >>>>>>>> >> >> > >> >
> >> >> >>>>>>>> >> >> > >> >    if (do_peeling
> >> >> >>>>>>>> >> >> > >> > @@ -1888,7 +1897,10 @@ vect_enhance_data_refs_alignment
> >> >> >>>>>>(loop_vec_info
> >> >> >>>>>>>> >> >> > >> > loop_vinfo)
> >> >> >>>>>>>> >> >> > >> >
> >> >> >>>>>>>> >> >> > >> >    do_versioning =
> >> >> >>>>>>>> >> >> > >> >         optimize_loop_nest_for_speed_p (loop)
> >> >> >>>>>>>> >> >> > >> > -       && (!loop->inner); /* FORNOW */
> >> >> >>>>>>>> >> >> > >> > +       && (!loop->inner) /* FORNOW */
> >> >> >>>>>>>> >> >> > >> > +        /* Required versioning was performed for the
> >> >> >>>>>>>> >> >> > >> > +          original loop and is not required for
> >> >> >>>>>>epilogue.  */
> >> >> >>>>>>>> >> >> > >> > +       && !LOOP_VINFO_EPILOGUE_P (loop_vinfo);
> >> >> >>>>>>>> >> >> > >> >
> >> >> >>>>>>>> >> >> > >> >    if (do_versioning)
> >> >> >>>>>>>> >> >> > >> >      {
> >> >> >>>>>>>> >> >> > >> >
> >> >> >>>>>>>> >> >> > >> > please do that check in the single caller of this
> >> >> >>>>>>function.
> >> >> >>>>>>>> >> >> > >> >
> >> >> >>>>>>>> >> >> > >> > Otherwise I still dislike the new ->aux use and I
> >> >> >>>>>>believe that simply
> >> >> >>>>>>>> >> >> > >> > passing down info from the processed parent would be
> >> >> >>>>>>_much_ cleaner.
> >> >> >>>>>>>> >> >> > >> > That is, here (and avoid the FOR_EACH_LOOP change):
> >> >> >>>>>>>> >> >> > >> >
> >> >> >>>>>>>> >> >> > >> > @@ -580,12 +586,21 @@ vectorize_loops (void)
> >> >> >>>>>>>> >> >> > >> >             && dump_enabled_p ())
> >> >> >>>>>>>> >> >> > >> >            dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
> >> >> >>>>>>vect_location,
> >> >> >>>>>>>> >> >> > >> >                             "loop vectorized\n");
> >> >> >>>>>>>> >> >> > >> > -       vect_transform_loop (loop_vinfo);
> >> >> >>>>>>>> >> >> > >> > +       new_loop = vect_transform_loop (loop_vinfo);
> >> >> >>>>>>>> >> >> > >> >         num_vectorized_loops++;
> >> >> >>>>>>>> >> >> > >> >         /* Now that the loop has been vectorized, allow
> >> >> >>>>>>it to be unrolled
> >> >> >>>>>>>> >> >> > >> >            etc.  */
> >> >> >>>>>>>> >> >> > >> >         loop->force_vectorize = false;
> >> >> >>>>>>>> >> >> > >> >
> >> >> >>>>>>>> >> >> > >> > +       /* Add new loop to a processing queue.  To make
> >> >> >>>>>>it easier
> >> >> >>>>>>>> >> >> > >> > +          to match loop and its epilogue vectorization
> >> >> >>>>>>in dumps
> >> >> >>>>>>>> >> >> > >> > +          put new loop as the next loop to process.
> >> >> >>>>>>*/
> >> >> >>>>>>>> >> >> > >> > +       if (new_loop)
> >> >> >>>>>>>> >> >> > >> > +         {
> >> >> >>>>>>>> >> >> > >> > +           loops.safe_insert (i + 1, new_loop->num);
> >> >> >>>>>>>> >> >> > >> > +           vect_loops_num = number_of_loops (cfun);
> >> >> >>>>>>>> >> >> > >> > +         }
> >> >> >>>>>>>> >> >> > >> >
> >> >> >>>>>>>> >> >> > >> > simply dispatch to a vectorize_epilogue (loop_vinfo,
> >> >> >>>>>>new_loop)
> >> >> >>>>>>>> >> >> > >> > function which will set up stuff properly (and also
> >> >> >>>>>>perform
> >> >> >>>>>>>> >> >> > >> > the if-conversion of the epilogue there).
> >> >> >>>>>>>> >> >> > >> >
> >> >> >>>>>>>> >> >> > >> > That said, if we can get in non-masked epilogue
> >> >> >>>>>>vectorization
> >> >> >>>>>>>> >> >> > >> > separately that would be great.
> >> >> >>>>>>>> >> >> > >> >
> >> >> >>>>>>>> >> >> > >> > I'm still torn about all the rest of the stuff and
> >> >> >>>>>>question its
> >> >> >>>>>>>> >> >> > >> > usability (esp. merging the epilogue with the main
> >> >> >>>>>>vector loop).
> >> >> >>>>>>>> >> >> > >> > But it has already been approved ... oh well.
> >> >> >>>>>>>> >> >> > >> >
> >> >> >>>>>>>> >> >> > >> > Thanks,
> >> >> >>>>>>>> >> >> > >> > Richard.
> >> >> >>>>>>>> >> >> > >>
> >> >> >>>>>>>> >> >> > >>
> >> >> >>>>>>>> >> >> > >
> >> >> >>>>>>>> >> >> > > --
> >> >> >>>>>>>> >> >> > > Richard Biener <rguenther@suse.de>
> >> >> >>>>>>>> >> >> > > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard,
> >> >> >>>>>>Graham Norton, HRB 21284 (AG Nuernberg)
> >> >> >>>>>>>> >> >> >
> >> >> >>>>>>>> >> >>
> >> >> >>>>>>>> >> >>
> >> >> >>>>>>>> >> >
> >> >> >>>>>>>> >> > --
> >> >> >>>>>>>> >> > Richard Biener <rguenther@suse.de>
> >> >> >>>>>>>> >> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham
> >> >> >>>>>>Norton, HRB 21284 (AG Nuernberg)
> >> >> >>>>>>>> >>
> >> >> >>>>>>>> >
> >> >> >>>>>>>> > --
> >> >> >>>>>>>> > Richard Biener <rguenther@suse.de>
> >> >> >>>>>>>> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham
> >> >> >>>>>>Norton, HRB 21284 (AG Nuernberg)
> >> >> >>>>>>>>
> >> >> >>>>>>>
> >> >> >>>>>>> --
> >> >> >>>>>>> Richard Biener <rguenther@suse.de>
> >> >> >>>>>>> SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham
> >> >> >>>>>>Norton, HRB 21284 (AG Nuernberg)
> >> >> >>>>>
> >> >> >>>>>
> >> >>
> >> >
> >> > --
> >> > Richard Biener <rguenther@suse.de>
> >> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)
> >>
> >
> > --
> > Richard Biener <rguenther@suse.de>
> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)
> 
>
Yuri Rumyantsev Dec. 21, 2016, 10:12 a.m. UTC | #3
Hi Richard,

I occasionally found out a bug in my patch related to epilogue
vectorization without masking : need to put label before
initialization.

Could you please review and integrate it to trunk. Test-case is also attached.


Thanks ahead.
Yuri.

ChangeLog:
2016-12-21  Yuri Rumyantsev  <ysrumyan@gmail.com>

* tree-vectorizer.c (vectorize_loops): Put label before initialization
of loop_vectorized_call.

gcc/testsuite/

* gcc.dg/vect/vect-tail-nomask-2.c: New test.

2016-12-13 16:59 GMT+03:00 Richard Biener <rguenther@suse.de>:
> On Mon, 12 Dec 2016, Yuri Rumyantsev wrote:
>
>> Richard,
>>
>> Could you please review cost model patch before to include it to
>> epilogue masking patch and add masking cost estimation as you
>> requested.
>
> That's just the middle-end / target changes.  I was not 100% happy
> with them but well, the vectorizer cost modeling needs work
> (aka another rewrite).
>
> From below...
>
>> Thanks.
>>
>> Patch and ChangeLog are attached.
>>
>> 2016-12-12 15:47 GMT+03:00 Yuri Rumyantsev <ysrumyan@gmail.com>:
>> > Hi Richard,
>> >
>> > You asked me about performance of spec2006 on AVX2 machine with new feature.
>> >
>> > I tried the following on Haswell using original patch designed by Ilya.
>> > 1. Masking low trip count loops  only 6 benchmarks are affected and
>> > performance is almost the same
>> > 464.h264ref     63.9000    64.0000 +0.15%
>> > 416.gamess      42.9000    42.9000 +0%
>> > 435.gromacs     32.8000    32.7000 -0.30%
>> > 447.dealII      68.5000    68.3000 -0.29%
>> > 453.povray      61.9000    62.1000 +0.32%
>> > 454.calculix    39.8000    39.8000 +0%
>> > 465.tonto       29.9000    29.9000 +0%
>> >
>> > 2. epilogue vectorization without masking (use less vf) (3 benchmarks
>> > are not affected)
>> > 400.perlbench     47.2000    46.5000 -1.48%
>> > 401.bzip2         29.9000    29.9000 +0%
>> > 403.gcc           41.8000    41.6000 -0.47%
>> > 456.hmmer         32.0000    32.0000 +0%
>> > 462.libquantum    81.5000    82.0000 +0.61%
>> > 464.h264ref       65.0000    65.5000 +0.76%
>> > 471.omnetpp       27.8000    28.2000 +1.43%
>> > 473.astar         28.7000    28.6000 -0.34%
>> > 483.xalancbmk     48.7000    48.6000 -0.20%
>> > 410.bwaves        95.3000    95.3000 +0%
>> > 416.gamess        42.9000    42.8000 -0.23%
>> > 433.milc          38.8000    38.8000 +0%
>> > 434.zeusmp        51.7000    51.4000 -0.58%
>> > 435.gromacs       32.8000    32.8000 +0%
>> > 436.cactusADM     85.0000    83.0000 -2.35%
>> > 437.leslie3d      55.5000    55.5000 +0%
>> > 444.namd          31.3000    31.3000 +0%
>> > 447.dealII        68.7000    68.9000 +0.29%
>> > 450.soplex        47.3000    47.4000 +0.21%
>> > 453.povray        62.1000    61.4000 -1.12%
>> > 454.calculix      39.7000    39.3000 -1.00%
>> > 459.GemsFDTD      44.9000    45.0000 +0.22%
>> > 465.tonto         29.8000    29.8000 +0%
>> > 481.wrf           51.0000    51.2000 +0.39%
>> > 482.sphinx3       69.8000    71.2000 +2.00%
>
> I see 471.omnetpp and 482.sphinx3 are in a similar ballpark and it
> would be nice to catch the relevant case(s) with a cost model for
> epilogue vectorization without masking first (to get rid of
> --param vect-epilogues-nomask).
>
> As said elsewhere any non-conservative cost modeling (if the
> number of scalar iterations is not statically constant) might
> require versioning of the loop into a non-vectorized,
> short-trip vectorized and regular vectorized case (the Intel
> compiler does way more aggressive versioning IIRC).
>
> Richard.
>
>> > 3. epilogue vectorization using masking (4 benchmarks are not affected):
>> > 400.perlbench     47.5000    46.8000 -1.47%
>> > 401.bzip2         30.0000    29.9000 -0.33%
>> > 403.gcc           42.3000    42.3000 +0%
>> > 445.gobmk         32.1000    32.8000 +2.18%
>> > 456.hmmer         32.0000    32.0000 +0%
>> > 458.sjeng         36.1000    35.5000 -1.66%
>> > 462.libquantum    81.1000    81.1000 +0%
>> > 464.h264ref       65.4000    65.0000 -0.61%
>> > 483.xalancbmk     49.4000    49.3000 -0.20%
>> > 410.bwaves        95.9000    95.5000 -0.41%
>> > 416.gamess        42.8000    42.6000 -0.46%
>> > 433.milc          38.8000    39.1000 +0.77%
>> > 434.zeusmp        52.1000    51.3000 -1.53%
>> > 435.gromacs       32.9000    32.9000 +0%
>> > 436.cactusADM     78.8000    85.3000 +8.24%
>> > 437.leslie3d      55.4000    55.4000 +0%
>> > 444.namd          31.3000    31.3000 +0%
>> > 447.dealII        69.0000    69.2000 +0.28%
>> > 450.soplex        47.7000    47.6000 -0.20%
>> > 453.povray        62.2000    61.7000 -0.80%
>> > 454.calculix      39.7000    38.2000 -3.77%
>> > 459.GemsFDTD      44.9000    45.0000 +0.22%
>> > 465.tonto         29.8000    29.9000 +0.33%
>> > 481.wrf           51.2000    51.6000 +0.78%
>> > 482.sphinx3       70.3000    65.4000 -6.97%
>> >
>> > There is a good speed-up for 436 but there is essential slow0down on 482, 454.
>> >
>> > So In general we don't have any advantages for AVX2.
>> >
>> > Best regards.
>> > Yuri.
>> >
>> > P.S.
>> > I  am not able to provide you with avx512 numbers because i don't have
>> > an access to it.
>> > Updated patch will be sent later.
>> >
>> > Best regards.
>> > Yuri.
>> >
>> >
>> > 2016-12-05 15:44 GMT+03:00 Yuri Rumyantsev <ysrumyan@gmail.com>:
>> >> Richard,
>> >>
>> >> Sorry, U sent you the bad assembly produced for loop with low trip
>> >> count, here is the correct one:
>> >>
>> >> vmovdqa .LC0(%rip), %ymm0
>> >> vpmaskmovd b(%rip), %ymm0, %ymm1
>> >> vpmaskmovd c(%rip), %ymm0, %ymm2
>> >> vpaddd %ymm2, %ymm1, %ymm1
>> >> vpmaskmovd %ymm1, %ymm0, a(%rip)
>> >>
>> >> where .LC0 vector with all elements equal to -1 except for the last.
>> >>
>> >> Note also that additional option is required --param
>> >> vect-short-loops=1 to do such conversion.
>> >>
>> >> Best regards.
>> >> Yuri.
>> >>
>> >> 2016-12-02 18:59 GMT+03:00 Yuri Rumyantsev <ysrumyan@gmail.com>:
>> >>> Richard,
>> >>>
>> >>> Important clarification: the test I sent you with low trip count is
>> >>> vectorized through masking only under
>> >>> --param vect-epilogues-combine=1 -fvect-epilogue-cost-model=unlimited
>> >>> for avx2. The laast option isrequired for avx2 since masked store has
>> >>> big cost in comparison with masked load.
>> >>>
>> >>> Below is assemby produced for it:
>> >>> vpcmpeqd %xmm0, %xmm0, %xmm0
>> >>> vpmaskmovd b(%rip), %xmm0, %xmm1
>> >>> vpmaskmovd c(%rip), %xmm0, %xmm2
>> >>> vpaddd %xmm2, %xmm1, %xmm1
>> >>> vpmaskmovd %xmm1, %xmm0, a(%rip)
>> >>> ret
>> >>>
>> >>> Thanks.
>> >>> Yuri.
>> >>>
>> >>>
>> >>> 2016-12-02 18:49 GMT+03:00 Yuri Rumyantsev <ysrumyan@gmail.com>:
>> >>>> Richard,
>> >>>>
>> >>>> I have also question about low trip count loops.
>> >>>> Did you mean that
>> >>>> int a[128], b[128], c[128];
>> >>>>
>> >>>> void foo ()
>> >>>> {
>> >>>>   int i;
>> >>>>   for (i = 0; i<7; i++)
>> >>>>     a[i] = b[i] + c[i];
>> >>>> }
>> >>>>
>> >>>> must be vectorizzed with masking without epilogue creation (e.g. for avx2)?
>> >>>>
>> >>>> Currently it vectorized with vector size 128. I also noticed that
>> >>>> original Ilya patch does nothing for such masking.
>> >>>>
>> >>>> Thanks.
>> >>>> Yuri.
>> >>>>
>> >>>> 2016-12-02 17:08 GMT+03:00 Yuri Rumyantsev <ysrumyan@gmail.com>:
>> >>>>> Richard,
>> >>>>>
>> >>>>> You wrote:
>> >>>>> I don't see _any_ cost model for vectorizing the epilogue with
>> >>>>> masking?  Am I missing something?  A "trivial" cost model
>> >>>>> should at least consider the additional IV(s), the mask
>> >>>>> compute and the widening and narrowing ops required.
>> >>>>>
>> >>>>> I skipped all changes related to cost model assuming that one of the
>> >>>>> next patch will contain all cost model changes.
>> >>>>>
>> >>>>> Should I include it to this patch?
>> >>>>>
>> >>>>> Thanks.
>> >>>>> Yuri.
>> >>>>>
>> >>>>> 2016-12-01 17:45 GMT+03:00 Richard Biener <rguenther@suse.de>:
>> >>>>>> On Thu, 1 Dec 2016, Yuri Rumyantsev wrote:
>> >>>>>>
>> >>>>>>> Thanks Richard for your comments.
>> >>>>>>>
>> >>>>>>> You asked me about possible performance improvements for AVX2 machines
>> >>>>>>> - we did not see any visible speed-up for spec2k with any method of
>> >>>>>>
>> >>>>>> Spec 2000?  Can you check with SPEC 2006 or CPUv6?
>> >>>>>>
>> >>>>>> Did you see performance degradation?  What about compile-time and
>> >>>>>> binary size effects?
>> >>>>>>
>> >>>>>>> masking, including epilogue masking and combining, only on AVX512
>> >>>>>>> machine aka knl.
>> >>>>>>
>> >>>>>> I see.
>> >>>>>>
>> >>>>>> Note that as said in the initial review patch the cost model I
>> >>>>>> saw therein looked flawed.  In the end I'd expect a sensible
>> >>>>>> approach would be to do
>> >>>>>>
>> >>>>>>  if (n < scalar-most-profitable-niter)
>> >>>>>>    {
>> >>>>>>      no vectorization
>> >>>>>>    }
>> >>>>>>  else if (n < masking-more-profitable-than-not-masking-plus-epilogue)
>> >>>>>>    {
>> >>>>>>      do masked vectorization
>> >>>>>>    }
>> >>>>>>  else
>> >>>>>>    {
>> >>>>>>      do unmasked vectorization (with epilogue, eventually vectorized)
>> >>>>>>    }
>> >>>>>>
>> >>>>>> where for short trip loops the else path would never be taken
>> >>>>>> (statically).
>> >>>>>>
>> >>>>>> And yes, that means masking will only be useful for short-trip loops
>> >>>>>> which in the end means an overall performance benfit is unlikely
>> >>>>>> unless we have a lot of short-trip loops that are slow because of
>> >>>>>> the overhead of main unmasked loop plus epilogue.
>> >>>>>>
>> >>>>>> Richard.
>> >>>>>>
>> >>>>>>> I will answer on your question later.
>> >>>>>>>
>> >>>>>>> Best regards.
>> >>>>>>> Yuri
>> >>>>>>>
>> >>>>>>> 2016-12-01 14:33 GMT+03:00 Richard Biener <rguenther@suse.de>:
>> >>>>>>> > On Mon, 28 Nov 2016, Yuri Rumyantsev wrote:
>> >>>>>>> >
>> >>>>>>> >> Richard!
>> >>>>>>> >>
>> >>>>>>> >> I attached vect dump for hte part of attached test-case which
>> >>>>>>> >> illustrated how vectorization of epilogues works through masking:
>> >>>>>>> >> #define SIZE 1023
>> >>>>>>> >> #define ALIGN 64
>> >>>>>>> >>
>> >>>>>>> >> extern int posix_memalign(void **memptr, __SIZE_TYPE__ alignment,
>> >>>>>>> >> __SIZE_TYPE__ size) __attribute__((weak));
>> >>>>>>> >> extern void free (void *);
>> >>>>>>> >>
>> >>>>>>> >> void __attribute__((noinline))
>> >>>>>>> >> test_citer (int * __restrict__ a,
>> >>>>>>> >>    int * __restrict__ b,
>> >>>>>>> >>    int * __restrict__ c)
>> >>>>>>> >> {
>> >>>>>>> >>   int i;
>> >>>>>>> >>
>> >>>>>>> >>   a = (int *)__builtin_assume_aligned (a, ALIGN);
>> >>>>>>> >>   b = (int *)__builtin_assume_aligned (b, ALIGN);
>> >>>>>>> >>   c = (int *)__builtin_assume_aligned (c, ALIGN);
>> >>>>>>> >>
>> >>>>>>> >>   for (i = 0; i < SIZE; i++)
>> >>>>>>> >>     c[i] = a[i] + b[i];
>> >>>>>>> >> }
>> >>>>>>> >>
>> >>>>>>> >> It was compiled with -mavx2 --param vect-epilogues-mask=1 options.
>> >>>>>>> >>
>> >>>>>>> >> I did not include in this patch vectorization of low trip-count loops
>> >>>>>>> >> since in the original patch additional parameter was introduced:
>> >>>>>>> >> +DEFPARAM (PARAM_VECT_SHORT_LOOPS,
>> >>>>>>> >> +  "vect-short-loops",
>> >>>>>>> >> +  "Enable vectorization of low trip count loops using masking.",
>> >>>>>>> >> +  0, 0, 1)
>> >>>>>>> >>
>> >>>>>>> >> I assume that this ability can be included very quickly but it
>> >>>>>>> >> requires cost model enhancements also.
>> >>>>>>> >
>> >>>>>>> > Comments on the patch itself (as I'm having a closer look again,
>> >>>>>>> > I know how it vectorizes the above but I wondered why epilogue
>> >>>>>>> > and short-trip loops are not basically the same code path).
>> >>>>>>> >
>> >>>>>>> > Btw, I don't like that the features are behind a --param paywall.
>> >>>>>>> > That just means a) nobody will use it, b) it will bit-rot quickly,
>> >>>>>>> > c) bugs are well-hidden.
>> >>>>>>> >
>> >>>>>>> > +  if (loop_vinfo && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
>> >>>>>>> > +      && integer_zerop (nested_in_vect_loop
>> >>>>>>> > +                       ? STMT_VINFO_DR_STEP (stmt_info)
>> >>>>>>> > +                       : DR_STEP (dr)))
>> >>>>>>> > +    {
>> >>>>>>> > +      if (dump_enabled_p ())
>> >>>>>>> > +       dump_printf_loc (MSG_NOTE, vect_location,
>> >>>>>>> > +                        "allow invariant load for masked loop.\n");
>> >>>>>>> > +    }
>> >>>>>>> >
>> >>>>>>> > this can test memory_access_type == VMAT_INVARIANT.  Please put
>> >>>>>>> > all the checks in a common
>> >>>>>>> >
>> >>>>>>> >   if (loop_vinfo && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
>> >>>>>>> >     {
>> >>>>>>> >        if (memory_access_type == VMAT_INVARIANT)
>> >>>>>>> >          {
>> >>>>>>> >          }
>> >>>>>>> >        else if (...)
>> >>>>>>> >          {
>> >>>>>>> >             LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = false;
>> >>>>>>> >          }
>> >>>>>>> >        else if (..)
>> >>>>>>> > ...
>> >>>>>>> >     }
>> >>>>>>> >
>> >>>>>>> > @@ -6667,6 +6756,15 @@ vectorizable_load (gimple *stmt,
>> >>>>>>> > gimple_stmt_iterator *gsi, gimple **vec_stmt,
>> >>>>>>> >        gcc_assert (!nested_in_vect_loop);
>> >>>>>>> >        gcc_assert (!STMT_VINFO_GATHER_SCATTER_P (stmt_info));
>> >>>>>>> >
>> >>>>>>> > +      if (loop_vinfo && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
>> >>>>>>> > +       {
>> >>>>>>> > +         if (dump_enabled_p ())
>> >>>>>>> > +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> >>>>>>> > +                            "cannot be masked: grouped access is not"
>> >>>>>>> > +                            " supported.");
>> >>>>>>> > +         LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = false;
>> >>>>>>> > +      }
>> >>>>>>> > +
>> >>>>>>> >
>> >>>>>>> > isn't this already handled by the above?  Or rather the general
>> >>>>>>> > disallowance of SLP?
>> >>>>>>> >
>> >>>>>>> > @@ -5730,6 +5792,24 @@ vectorizable_store (gimple *stmt,
>> >>>>>>> > gimple_stmt_iterator *gsi, gimple **vec_stmt,
>> >>>>>>> >                             &memory_access_type, &gs_info))
>> >>>>>>> >      return false;
>> >>>>>>> >
>> >>>>>>> > +  if (loop_vinfo && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
>> >>>>>>> > +      && memory_access_type != VMAT_CONTIGUOUS)
>> >>>>>>> > +    {
>> >>>>>>> > +      LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = false;
>> >>>>>>> > +      if (dump_enabled_p ())
>> >>>>>>> > +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> >>>>>>> > +                        "cannot be masked: unsupported memory access
>> >>>>>>> > type.\n");
>> >>>>>>> > +    }
>> >>>>>>> > +
>> >>>>>>> > +  if (loop_vinfo && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
>> >>>>>>> > +      && !can_mask_load_store (stmt))
>> >>>>>>> > +    {
>> >>>>>>> > +      LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = false;
>> >>>>>>> > +      if (dump_enabled_p ())
>> >>>>>>> > +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> >>>>>>> > +                        "cannot be masked: unsupported masked store.\n");
>> >>>>>>> > +    }
>> >>>>>>> > +
>> >>>>>>> >
>> >>>>>>> > likewise please combine the ifs.
>> >>>>>>> >
>> >>>>>>> > @@ -2354,7 +2401,10 @@ vectorizable_mask_load_store (gimple *stmt,
>> >>>>>>> > gimple_stmt_iterator *gsi,
>> >>>>>>> >                                           ptr, vec_mask, vec_rhs);
>> >>>>>>> >           vect_finish_stmt_generation (stmt, new_stmt, gsi);
>> >>>>>>> >           if (i == 0)
>> >>>>>>> > -           STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
>> >>>>>>> > +           {
>> >>>>>>> > +             STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
>> >>>>>>> > +             STMT_VINFO_FIRST_COPY_P (vinfo_for_stmt (new_stmt)) = true;
>> >>>>>>> > +           }
>> >>>>>>> >           else
>> >>>>>>> >             STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
>> >>>>>>> >           prev_stmt_info = vinfo_for_stmt (new_stmt);
>> >>>>>>> >
>> >>>>>>> > here you only set the flag, elsewhere you copy DR and VECTYPE as well.
>> >>>>>>> >
>> >>>>>>> > @@ -2113,6 +2146,20 @@ vectorizable_mask_load_store (gimple *stmt,
>> >>>>>>> > gimple_stmt_iterator *gsi,
>> >>>>>>> >                && !useless_type_conversion_p (vectype, rhs_vectype)))
>> >>>>>>> >      return false;
>> >>>>>>> >
>> >>>>>>> > +  if (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
>> >>>>>>> > +    {
>> >>>>>>> > +      /* Check that mask conjuction is supported.  */
>> >>>>>>> > +      optab tab;
>> >>>>>>> > +      tab = optab_for_tree_code (BIT_AND_EXPR, vectype, optab_default);
>> >>>>>>> > +      if (!tab || optab_handler (tab, TYPE_MODE (vectype)) ==
>> >>>>>>> > CODE_FOR_nothing)
>> >>>>>>> > +       {
>> >>>>>>> > +         if (dump_enabled_p ())
>> >>>>>>> > +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> >>>>>>> > +                            "cannot be masked: unsupported mask
>> >>>>>>> > operation\n");
>> >>>>>>> > +         LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = false;
>> >>>>>>> > +       }
>> >>>>>>> > +    }
>> >>>>>>> >
>> >>>>>>> > does this really test whether we can bit-and the mask?  You are
>> >>>>>>> > using the vector type of the store (which might be V2DF for example),
>> >>>>>>> > also for AVX512 it might be a vector-bool type with integer mode?
>> >>>>>>> > Of course we maybe can simply assume mask conjunction is available
>> >>>>>>> > (I know no ISA where that would be not true).
>> >>>>>>> >
>> >>>>>>> > +/* Return true if STMT can be converted to masked form.  */
>> >>>>>>> > +
>> >>>>>>> > +static bool
>> >>>>>>> > +can_mask_load_store (gimple *stmt)
>> >>>>>>> > +{
>> >>>>>>> > +  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
>> >>>>>>> > +  tree vectype, mask_vectype;
>> >>>>>>> > +  tree lhs, ref;
>> >>>>>>> > +
>> >>>>>>> > +  if (!stmt_info)
>> >>>>>>> > +    return false;
>> >>>>>>> > +  lhs = gimple_assign_lhs (stmt);
>> >>>>>>> > +  ref = (TREE_CODE (lhs) == SSA_NAME) ? gimple_assign_rhs1 (stmt) : lhs;
>> >>>>>>> > +  if (may_be_nonaddressable_p (ref))
>> >>>>>>> > +    return false;
>> >>>>>>> > +  vectype = STMT_VINFO_VECTYPE (stmt_info);
>> >>>>>>> >
>> >>>>>>> > You probably modeled this after ifcvt_can_use_mask_load_store but I
>> >>>>>>> > don't think checking may_be_nonaddressable_p is necessary (we couldn't
>> >>>>>>> > even vectorize such refs).  stmt_info should never be NULL either.
>> >>>>>>> > With the check removed tree-ssa-loop-ivopts.h should no longer be
>> >>>>>>> > necessary.
>> >>>>>>> >
>> >>>>>>> > +static void
>> >>>>>>> > +vect_mask_load_store_stmt (gimple *stmt, tree vectype, tree mask,
>> >>>>>>> > +                          data_reference *dr, gimple_stmt_iterator *si)
>> >>>>>>> > +{
>> >>>>>>> > ...
>> >>>>>>> > +  addr = force_gimple_operand_gsi (&gsi, build_fold_addr_expr (mem),
>> >>>>>>> > +                                  true, NULL_TREE, true,
>> >>>>>>> > +                                  GSI_SAME_STMT);
>> >>>>>>> > +
>> >>>>>>> > +  align = TYPE_ALIGN_UNIT (vectype);
>> >>>>>>> > +  if (aligned_access_p (dr))
>> >>>>>>> > +    misalign = 0;
>> >>>>>>> > +  else if (DR_MISALIGNMENT (dr) == -1)
>> >>>>>>> > +    {
>> >>>>>>> > +      align = TYPE_ALIGN_UNIT (elem_type);
>> >>>>>>> > +      misalign = 0;
>> >>>>>>> > +    }
>> >>>>>>> > +  else
>> >>>>>>> > +    misalign = DR_MISALIGNMENT (dr);
>> >>>>>>> > +  set_ptr_info_alignment (get_ptr_info (addr), align, misalign);
>> >>>>>>> > +  ptr = build_int_cst (reference_alias_ptr_type (mem),
>> >>>>>>> > +                      misalign ? misalign & -misalign : align);
>> >>>>>>> >
>> >>>>>>> > you should simply use
>> >>>>>>> >
>> >>>>>>> >   align = get_object_alignment (mem) / BITS_PER_UNIT;
>> >>>>>>> >
>> >>>>>>> > here rather than trying to be clever.  Eventually you don't need
>> >>>>>>> > the DR then (see question above).
>> >>>>>>> >
>> >>>>>>> > +    }
>> >>>>>>> > +  gsi_replace (si ? si : &gsi, new_stmt, false);
>> >>>>>>> >
>> >>>>>>> > when you replace the load/store please previously copy VUSE and VDEF
>> >>>>>>> > from the original one (we were nearly clean enough to no longer
>> >>>>>>> > require a virtual operand rewrite after vectorization...)  Thus
>> >>>>>>> >
>> >>>>>>> >   gimple_set_vuse (new_stmt, gimple_vuse (stmt));
>> >>>>>>> >   gimple_set_vdef (new_stmt, gimple_vdef (stmt));
>> >>>>>>> >
>> >>>>>>> > +static void
>> >>>>>>> > +vect_mask_loop (loop_vec_info loop_vinfo)
>> >>>>>>> > +{
>> >>>>>>> > ...
>> >>>>>>> > +  /* Scan all loop statements to convert vector load/store including
>> >>>>>>> > masked
>> >>>>>>> > +     form.  */
>> >>>>>>> > +  for (unsigned i = 0; i < loop->num_nodes; i++)
>> >>>>>>> > +    {
>> >>>>>>> > +      basic_block bb = bbs[i];
>> >>>>>>> > +      for (gimple_stmt_iterator si = gsi_start_bb (bb);
>> >>>>>>> > +          !gsi_end_p (si); gsi_next (&si))
>> >>>>>>> > +       {
>> >>>>>>> > +         gimple *stmt = gsi_stmt (si);
>> >>>>>>> > +         stmt_vec_info stmt_info = NULL;
>> >>>>>>> > +         tree vectype = NULL;
>> >>>>>>> > +         data_reference *dr;
>> >>>>>>> > +
>> >>>>>>> > +         /* Mask load case.  */
>> >>>>>>> > +         if (is_gimple_call (stmt)
>> >>>>>>> > +             && gimple_call_internal_p (stmt)
>> >>>>>>> > +             && gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
>> >>>>>>> > +             && !VECTOR_TYPE_P (TREE_TYPE (gimple_call_arg (stmt, 2))))
>> >>>>>>> > +           {
>> >>>>>>> > ...
>> >>>>>>> > +             /* Skip invariant loads.  */
>> >>>>>>> > +             if (integer_zerop (nested_in_vect_loop_p (loop, stmt)
>> >>>>>>> > +                                ? STMT_VINFO_DR_STEP (stmt_info)
>> >>>>>>> > +                                : DR_STEP (STMT_VINFO_DATA_REF
>> >>>>>>> > (stmt_info))))
>> >>>>>>> > +               continue;
>> >>>>>>> >
>> >>>>>>> > seeing this it would be nice if stmt_info had a flag for whether
>> >>>>>>> > the stmt needs masking (and a flag on wheter this is a scalar or a
>> >>>>>>> > vectorized stmt).
>> >>>>>>> >
>> >>>>>>> > +         /* Skip hoisted out statements.  */
>> >>>>>>> > +         if (!flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
>> >>>>>>> > +           continue;
>> >>>>>>> >
>> >>>>>>> > err, you walk stmts in the loop!  Isn't this covered by the above
>> >>>>>>> > skipping of 'invariant loads'?
>> >>>>>>> >
>> >>>>>>> > +static gimple *
>> >>>>>>> > +vect_mask_reduction_stmt (gimple *stmt, tree mask, gimple *prev)
>> >>>>>>> > +{
>> >>>>>>> >
>> >>>>>>> > depending on the reduction operand there are variants that
>> >>>>>>> > could get away w/o the VEC_COND_EXPR, like
>> >>>>>>> >
>> >>>>>>> >   S1': tem_4 = d_3 & MASK;
>> >>>>>>> >   S2': r_1 = r_2 + tem_4;
>> >>>>>>> >
>> >>>>>>> > which works for plus at least.  More generally doing
>> >>>>>>> >
>> >>>>>>> >   S1': tem_4 = VEC_COND_EXPR<MASK, d_3, neutral operand>
>> >>>>>>> >   S2': r_1 = r_2 OP tem_4;
>> >>>>>>> >
>> >>>>>>> > and leaving optimization to & to later opts (& won't work for
>> >>>>>>> > AVX512 mask registers I guess).
>> >>>>>>> >
>> >>>>>>> > Good enough for later enhacement of course.
>> >>>>>>> >
>> >>>>>>> > +static void
>> >>>>>>> > +vect_gen_ivs_for_masking (loop_vec_info loop_vinfo, vec<tree> *ivs)
>> >>>>>>> > +{
>> >>>>>>> > ...
>> >>>>>>> >
>> >>>>>>> > isn't it enough to always create a single IV and derive the
>> >>>>>>> > additional copies by IV + i * { elems, elems, elems ... }?
>> >>>>>>> > IVs are expensive -- I'm sure we can optimize the rest of the
>> >>>>>>> > scheme further as well but this one looks obvious to me.
>> >>>>>>> >
>> >>>>>>> > @@ -3225,12 +3508,32 @@ vect_estimate_min_profitable_iters (loop_vec_info
>> >>>>>>> > loop_vinfo,
>> >>>>>>> >    int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
>> >>>>>>> >    void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
>> >>>>>>> >
>> >>>>>>> > +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
>> >>>>>>> > +    {
>> >>>>>>> > +      /* Currently we don't produce scalar epilogue version in case
>> >>>>>>> > +        its masked version is provided.  It means we don't need to
>> >>>>>>> > +        compute profitability one more time here.  Just make a
>> >>>>>>> > +        masked loop version.  */
>> >>>>>>> > +      if (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
>> >>>>>>> > +         && PARAM_VALUE (PARAM_VECT_EPILOGUES_MASK))
>> >>>>>>> > +       {
>> >>>>>>> > +         dump_printf_loc (MSG_NOTE, vect_location,
>> >>>>>>> > +                          "cost model: mask loop epilogue.\n");
>> >>>>>>> > +         LOOP_VINFO_MASK_LOOP (loop_vinfo) = true;
>> >>>>>>> > +         *ret_min_profitable_niters = 0;
>> >>>>>>> > +         *ret_min_profitable_estimate = 0;
>> >>>>>>> > +         return;
>> >>>>>>> > +       }
>> >>>>>>> > +    }
>> >>>>>>> >    /* Cost model disabled.  */
>> >>>>>>> > -  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
>> >>>>>>> > +  else if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
>> >>>>>>> >      {
>> >>>>>>> >        dump_printf_loc (MSG_NOTE, vect_location, "cost model
>> >>>>>>> > disabled.\n");
>> >>>>>>> >        *ret_min_profitable_niters = 0;
>> >>>>>>> >        *ret_min_profitable_estimate = 0;
>> >>>>>>> > +      if (PARAM_VALUE (PARAM_VECT_EPILOGUES_MASK)
>> >>>>>>> > +         && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
>> >>>>>>> > +       LOOP_VINFO_MASK_LOOP (loop_vinfo) = true;
>> >>>>>>> >        return;
>> >>>>>>> >      }
>> >>>>>>> >
>> >>>>>>> > the unlimited_cost_model case should come first?  OTOH masking or
>> >>>>>>> > not is probably not sth covered by 'unlimited' - that is about
>> >>>>>>> > vectorizing or not.  But the above code means that for
>> >>>>>>> > epilogue vectorization w/o masking we ignore unlimited_cost_model ()?
>> >>>>>>> > That doesn't make sense to me.
>> >>>>>>> >
>> >>>>>>> > Plus if this is short-trip or epilogue vectorization and the
>> >>>>>>> > cost model is _not_ unlimited then we dont' want to enable
>> >>>>>>> > masking always (if it is possible).  It might be we statically
>> >>>>>>> > know the epilogue executes for at most two iterations for example.
>> >>>>>>> >
>> >>>>>>> > I don't see _any_ cost model for vectorizing the epilogue with
>> >>>>>>> > masking?  Am I missing something?  A "trivial" cost model
>> >>>>>>> > should at least consider the additional IV(s), the mask
>> >>>>>>> > compute and the widening and narrowing ops required.
>> >>>>>>> >
>> >>>>>>> > diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
>> >>>>>>> > index e13d6a2..36be342 100644
>> >>>>>>> > --- a/gcc/tree-vect-loop-manip.c
>> >>>>>>> > +++ b/gcc/tree-vect-loop-manip.c
>> >>>>>>> > @@ -1635,6 +1635,13 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree
>> >>>>>>> > niters, tree nitersm1,
>> >>>>>>> >    bool epilog_peeling = (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
>> >>>>>>> >                          || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
>> >>>>>>> >
>> >>>>>>> > +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
>> >>>>>>> > +    {
>> >>>>>>> > +      prolog_peeling = false;
>> >>>>>>> > +      if (LOOP_VINFO_MASK_LOOP (loop_vinfo))
>> >>>>>>> > +       epilog_peeling = false;
>> >>>>>>> > +    }
>> >>>>>>> > +
>> >>>>>>> >    if (!prolog_peeling && !epilog_peeling)
>> >>>>>>> >      return NULL;
>> >>>>>>> >
>> >>>>>>> > I think the prolog_peeling was fixed during the epilogue vectorization
>> >>>>>>> > review and should no longer be necessary.  Please add
>> >>>>>>> > a && ! LOOP_VINFO_MASK_LOOP () to the epilog_peeling init instead
>> >>>>>>> > (it should also work for short-trip loop vectorization).
>> >>>>>>> >
>> >>>>>>> > @@ -2022,11 +2291,18 @@ start_over:
>> >>>>>>> >        || (max_niter != -1
>> >>>>>>> >           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
>> >>>>>>> >      {
>> >>>>>>> > -      if (dump_enabled_p ())
>> >>>>>>> > -       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> >>>>>>> > -                        "not vectorized: iteration count smaller than "
>> >>>>>>> > -                        "vectorization factor.\n");
>> >>>>>>> > -      return false;
>> >>>>>>> > +      /* Allow low trip count for loop epilogue we want to mask.  */
>> >>>>>>> > +      if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
>> >>>>>>> > +         && PARAM_VALUE (PARAM_VECT_EPILOGUES_MASK))
>> >>>>>>> > +       LOOP_VINFO_MASK_LOOP (loop_vinfo) = true;
>> >>>>>>> > +      else
>> >>>>>>> > +       {
>> >>>>>>> > +         if (dump_enabled_p ())
>> >>>>>>> >
>> >>>>>>> > so why do we test only LOOP_VINFO_EPILOGUE_P here?  All the code
>> >>>>>>> > I saw sofar would also work for the main loop (but the cost
>> >>>>>>> > model is missing).
>> >>>>>>> >
>> >>>>>>> > I am missing testcases.  There's only a single one but we should
>> >>>>>>> > have cases covering all kinds of mask IV widths and widen/shorten
>> >>>>>>> > masks.
>> >>>>>>> >
>> >>>>>>> > Do you have any numbers on SPEC 2k6 with epilogue vect and/or masking
>> >>>>>>> > enabled for an AVX2 machine?
>> >>>>>>> >
>> >>>>>>> > Oh, and I really dislike the --param paywall.
>> >>>>>>> >
>> >>>>>>> > Thanks,
>> >>>>>>> > Richard.
>> >>>>>>> >
>> >>>>>>> >> Best regards.
>> >>>>>>> >> Yuri.
>> >>>>>>> >>
>> >>>>>>> >>
>> >>>>>>> >> 2016-11-28 17:39 GMT+03:00 Richard Biener <rguenther@suse.de>:
>> >>>>>>> >> > On Thu, 24 Nov 2016, Yuri Rumyantsev wrote:
>> >>>>>>> >> >
>> >>>>>>> >> >> Hi All,
>> >>>>>>> >> >>
>> >>>>>>> >> >> Here is the second patch which supports epilogue vectorization using
>> >>>>>>> >> >> masking without cost model. Currently it is possible
>> >>>>>>> >> >> only with passing parameter "--param vect-epilogues-mask=1".
>> >>>>>>> >> >>
>> >>>>>>> >> >> Bootstrapping and regression testing did not show any new regression.
>> >>>>>>> >> >>
>> >>>>>>> >> >> Any comments will be appreciated.
>> >>>>>>> >> >
>> >>>>>>> >> > Going over the patch the main question is one how it works -- it looks
>> >>>>>>> >> > like the decision whether to vectorize & mask the epilogue is made
>> >>>>>>> >> > when vectorizing the loop that generates the epilogue rather than
>> >>>>>>> >> > in the epilogue vectorization path?
>> >>>>>>> >> >
>> >>>>>>> >> > That is, I'd have expected to see this handling low-trip count loops
>> >>>>>>> >> > by masking?  And thus masking the epilogue simply by it being
>> >>>>>>> >> > low-trip count?
>> >>>>>>> >> >
>> >>>>>>> >> > Richard.
>> >>>>>>> >> >
>> >>>>>>> >> >> ChangeLog:
>> >>>>>>> >> >> 2016-11-24  Yuri Rumyantsev  <ysrumyan@gmail.com>
>> >>>>>>> >> >>
>> >>>>>>> >> >> * params.def (PARAM_VECT_EPILOGUES_MASK): New.
>> >>>>>>> >> >> * tree-vect-data-refs.c (vect_get_new_ssa_name): Support vect_mask_var.
>> >>>>>>> >> >> * tree-vect-loop.c: Include insn-config.h, recog.h and alias.h.
>> >>>>>>> >> >> (new_loop_vec_info): Add zeroing can_be_masked, mask_loop and
>> >>>>>>> >> >> required_mask fields.
>> >>>>>>> >> >> (vect_check_required_masks_widening): New.
>> >>>>>>> >> >> (vect_check_required_masks_narrowing): New.
>> >>>>>>> >> >> (vect_get_masking_iv_elems): New.
>> >>>>>>> >> >> (vect_get_masking_iv_type): New.
>> >>>>>>> >> >> (vect_get_extreme_masks): New.
>> >>>>>>> >> >> (vect_check_required_masks): New.
>> >>>>>>> >> >> (vect_analyze_loop_operations): Call vect_check_required_masks if all
>> >>>>>>> >> >> statements can be masked.
>> >>>>>>> >> >> (vect_analyze_loop_2): Inititalize to zero min_scalar_loop_bound.
>> >>>>>>> >> >> Add check that epilogue can be masked with the same vf with issue
>> >>>>>>> >> >> fail notes.  Allow epilogue vectorization through masking of low trip
>> >>>>>>> >> >> loops. Set to true can_be_masked field before loop operation analysis.
>> >>>>>>> >> >> Do not set-up min_scalar_loop_bound for epilogue vectorization through
>> >>>>>>> >> >> masking.  Do not peeling for epilogue masking.  Reset can_be_masked
>> >>>>>>> >> >> field before repeat analysis.
>> >>>>>>> >> >> (vect_estimate_min_profitable_iters): Do not compute profitability
>> >>>>>>> >> >> for epilogue masking.  Set up mask_loop filed to true if parameter
>> >>>>>>> >> >> PARAM_VECT_EPILOGUES_MASK is non-zero.
>> >>>>>>> >> >> (vectorizable_reduction): Add check that statement can be masked.
>> >>>>>>> >> >> (vectorizable_induction): Do not support masking for induction.
>> >>>>>>> >> >> (vect_gen_ivs_for_masking): New.
>> >>>>>>> >> >> (vect_get_mask_index_for_elems): New.
>> >>>>>>> >> >> (vect_get_mask_index_for_type): New.
>> >>>>>>> >> >> (vect_create_narrowed_masks): New.
>> >>>>>>> >> >> (vect_create_widened_masks): New.
>> >>>>>>> >> >> (vect_gen_loop_masks): New.
>> >>>>>>> >> >> (vect_mask_reduction_stmt): New.
>> >>>>>>> >> >> (vect_mask_mask_load_store_stmt): New.
>> >>>>>>> >> >> (vect_mask_load_store_stmt): New.
>> >>>>>>> >> >> (vect_mask_loop): New.
>> >>>>>>> >> >> (vect_transform_loop): Invoke vect_mask_loop if required.
>> >>>>>>> >> >> Use div_ceil to recompute upper bounds for masked loops.  Issue
>> >>>>>>> >> >> statistics for epilogue vectorization through masking. Do not reduce
>> >>>>>>> >> >> vf for masking epilogue.
>> >>>>>>> >> >> * tree-vect-stmts.c: Include tree-ssa-loop-ivopts.h.
>> >>>>>>> >> >> (can_mask_load_store): New.
>> >>>>>>> >> >> (vectorizable_mask_load_store): Check that mask conjuction is
>> >>>>>>> >> >> supported.  Set-up first_copy_p field of stmt_vinfo.
>> >>>>>>> >> >> (vectorizable_simd_clone_call): Check that simd clone can not be
>> >>>>>>> >> >> masked.
>> >>>>>>> >> >> (vectorizable_store): Check that store can be masked. Mark the first
>> >>>>>>> >> >> copy of generated vector stores and provide it with vectype and the
>> >>>>>>> >> >> original data reference.
>> >>>>>>> >> >> (vectorizable_load): Check that load can be masked.
>> >>>>>>> >> >> (vect_stmt_should_be_masked_for_epilogue): New.
>> >>>>>>> >> >> (vect_add_required_mask_for_stmt): New.
>> >>>>>>> >> >> (vect_analyze_stmt): Add check on unsupported statements for masking
>> >>>>>>> >> >> with printing message.
>> >>>>>>> >> >> * tree-vectorizer.h (struct _loop_vec_info): Add new fields
>> >>>>>>> >> >> can_be_maske, required_masks, masl_loop.
>> >>>>>>> >> >> (LOOP_VINFO_CAN_BE_MASKED): New.
>> >>>>>>> >> >> (LOOP_VINFO_REQUIRED_MASKS): New.
>> >>>>>>> >> >> (LOOP_VINFO_MASK_LOOP): New.
>> >>>>>>> >> >> (struct _stmt_vec_info): Add first_copy_p field.
>> >>>>>>> >> >> (STMT_VINFO_FIRST_COPY_P): New.
>> >>>>>>> >> >>
>> >>>>>>> >> >> gcc/testsuite/
>> >>>>>>> >> >>
>> >>>>>>> >> >> * gcc.dg/vect/vect-tail-mask-1.c: New test.
>> >>>>>>> >> >>
>> >>>>>>> >> >> 2016-11-18 18:54 GMT+03:00 Christophe Lyon <christophe.lyon@linaro.org>:
>> >>>>>>> >> >> > On 18 November 2016 at 16:46, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>> >>>>>>> >> >> >> It is very strange that this test failed on arm, since it requires
>> >>>>>>> >> >> >> target avx2 to check vectorizer dumps:
>> >>>>>>> >> >> >>
>> >>>>>>> >> >> >> /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" {
>> >>>>>>> >> >> >> target avx2_runtime } } } */
>> >>>>>>> >> >> >> /* { dg-final { scan-tree-dump-times "LOOP EPILOGUE VECTORIZED
>> >>>>>>> >> >> >> \\(VS=16\\)" 2 "vect" { target avx2_runtime } } } */
>> >>>>>>> >> >> >>
>> >>>>>>> >> >> >> Could you please clarify what is the reason of the failure?
>> >>>>>>> >> >> >
>> >>>>>>> >> >> > It's not the scan-dumps that fail, but the execution.
>> >>>>>>> >> >> > The test calls abort() for some reason.
>> >>>>>>> >> >> >
>> >>>>>>> >> >> > It will take me a while to rebuild the test manually in the right
>> >>>>>>> >> >> > debug environment to provide you with more traces.
>> >>>>>>> >> >> >
>> >>>>>>> >> >> >
>> >>>>>>> >> >> >
>> >>>>>>> >> >> >>
>> >>>>>>> >> >> >> Thanks.
>> >>>>>>> >> >> >>
>> >>>>>>> >> >> >> 2016-11-18 16:20 GMT+03:00 Christophe Lyon <christophe.lyon@linaro.org>:
>> >>>>>>> >> >> >>> On 15 November 2016 at 15:41, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>> >>>>>>> >> >> >>>> Hi All,
>> >>>>>>> >> >> >>>>
>> >>>>>>> >> >> >>>> Here is patch for non-masked epilogue vectoriziation.
>> >>>>>>> >> >> >>>>
>> >>>>>>> >> >> >>>> Bootstrap and regression testing did not show any new failures.
>> >>>>>>> >> >> >>>>
>> >>>>>>> >> >> >>>> Is it OK for trunk?
>> >>>>>>> >> >> >>>>
>> >>>>>>> >> >> >>>> Thanks.
>> >>>>>>> >> >> >>>> Changelog:
>> >>>>>>> >> >> >>>>
>> >>>>>>> >> >> >>>> 2016-11-15  Yuri Rumyantsev  <ysrumyan@gmail.com>
>> >>>>>>> >> >> >>>>
>> >>>>>>> >> >> >>>> * params.def (PARAM_VECT_EPILOGUES_NOMASK): New.
>> >>>>>>> >> >> >>>> * tree-if-conv.c (tree_if_conversion): Make public.
>> >>>>>>> >> >> >>>> * * tree-if-conv.h: New file.
>> >>>>>>> >> >> >>>> * tree-vect-data-refs.c (vect_analyze_data_ref_dependences) Avoid
>> >>>>>>> >> >> >>>> dynamic alias checks for epilogues.
>> >>>>>>> >> >> >>>> * tree-vect-loop-manip.c (vect_do_peeling): Return created epilog.
>> >>>>>>> >> >> >>>> * tree-vect-loop.c: include tree-if-conv.h.
>> >>>>>>> >> >> >>>> (new_loop_vec_info): Add zeroing orig_loop_info field.
>> >>>>>>> >> >> >>>> (vect_analyze_loop_2): Don't try to enhance alignment for epilogues.
>> >>>>>>> >> >> >>>> (vect_analyze_loop): Add argument ORIG_LOOP_INFO which is not NULL
>> >>>>>>> >> >> >>>> if epilogue is vectorized, set up orig_loop_info field of loop_vinfo
>> >>>>>>> >> >> >>>> using passed argument.
>> >>>>>>> >> >> >>>> (vect_transform_loop): Check if created epilogue should be returned
>> >>>>>>> >> >> >>>> for further vectorization with less vf.  If-convert epilogue if
>> >>>>>>> >> >> >>>> required. Print vectorization success for epilogue.
>> >>>>>>> >> >> >>>> * tree-vectorizer.c (vectorize_loops): Add epilogue vectorization
>> >>>>>>> >> >> >>>> if it is required, pass loop_vinfo produced during vectorization of
>> >>>>>>> >> >> >>>> loop body to vect_analyze_loop.
>> >>>>>>> >> >> >>>> * tree-vectorizer.h (struct _loop_vec_info): Add new field
>> >>>>>>> >> >> >>>> orig_loop_info.
>> >>>>>>> >> >> >>>> (LOOP_VINFO_ORIG_LOOP_INFO): New.
>> >>>>>>> >> >> >>>> (LOOP_VINFO_EPILOGUE_P): New.
>> >>>>>>> >> >> >>>> (LOOP_VINFO_ORIG_VECT_FACTOR): New.
>> >>>>>>> >> >> >>>> (vect_do_peeling): Change prototype to return epilogue.
>> >>>>>>> >> >> >>>> (vect_analyze_loop): Add argument of loop_vec_info type.
>> >>>>>>> >> >> >>>> (vect_transform_loop): Return created loop.
>> >>>>>>> >> >> >>>>
>> >>>>>>> >> >> >>>> gcc/testsuite/
>> >>>>>>> >> >> >>>>
>> >>>>>>> >> >> >>>> * lib/target-supports.exp (check_avx2_hw_available): New.
>> >>>>>>> >> >> >>>> (check_effective_target_avx2_runtime): New.
>> >>>>>>> >> >> >>>> * gcc.dg/vect/vect-tail-nomask-1.c: New test.
>> >>>>>>> >> >> >>>>
>> >>>>>>> >> >> >>>
>> >>>>>>> >> >> >>> Hi,
>> >>>>>>> >> >> >>>
>> >>>>>>> >> >> >>> This new test fails on arm-none-eabi (using default cpu/fpu/mode):
>> >>>>>>> >> >> >>>   gcc.dg/vect/vect-tail-nomask-1.c -flto -ffat-lto-objects execution test
>> >>>>>>> >> >> >>>   gcc.dg/vect/vect-tail-nomask-1.c execution test
>> >>>>>>> >> >> >>>
>> >>>>>>> >> >> >>> It does pass on the same target if configured --with-cpu=cortex-a9.
>> >>>>>>> >> >> >>>
>> >>>>>>> >> >> >>> Christophe
>> >>>>>>> >> >> >>>
>> >>>>>>> >> >> >>>
>> >>>>>>> >> >> >>>
>> >>>>>>> >> >> >>>>
>> >>>>>>> >> >> >>>> 2016-11-14 20:04 GMT+03:00 Richard Biener <rguenther@suse.de>:
>> >>>>>>> >> >> >>>>> On November 14, 2016 4:39:40 PM GMT+01:00, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>> >>>>>>> >> >> >>>>>>Richard,
>> >>>>>>> >> >> >>>>>>
>> >>>>>>> >> >> >>>>>>I checked one of the tests designed for epilogue vectorization using
>> >>>>>>> >> >> >>>>>>patches 1 - 3 and found out that build compiler performs vectorization
>> >>>>>>> >> >> >>>>>>of epilogues with --param vect-epilogues-nomask=1 passed:
>> >>>>>>> >> >> >>>>>>
>> >>>>>>> >> >> >>>>>>$ gcc -Ofast -mavx2 t1.c -S --param vect-epilogues-nomask=1 -o
>> >>>>>>> >> >> >>>>>>t1.new-nomask.s -fdump-tree-vect-details
>> >>>>>>> >> >> >>>>>>$ grep VECTORIZED -c t1.c.156t.vect
>> >>>>>>> >> >> >>>>>>4
>> >>>>>>> >> >> >>>>>> Without param only 2 loops are vectorized.
>> >>>>>>> >> >> >>>>>>
>> >>>>>>> >> >> >>>>>>Should I simply add a part of tests related to this feature or I must
>> >>>>>>> >> >> >>>>>>delete all not necessary changes also?
>> >>>>>>> >> >> >>>>>
>> >>>>>>> >> >> >>>>> Please remove all not necessary changes.
>> >>>>>>> >> >> >>>>>
>> >>>>>>> >> >> >>>>> Richard.
>> >>>>>>> >> >> >>>>>
>> >>>>>>> >> >> >>>>>>Thanks.
>> >>>>>>> >> >> >>>>>>Yuri.
>> >>>>>>> >> >> >>>>>>
>> >>>>>>> >> >> >>>>>>2016-11-14 16:40 GMT+03:00 Richard Biener <rguenther@suse.de>:
>> >>>>>>> >> >> >>>>>>> On Mon, 14 Nov 2016, Yuri Rumyantsev wrote:
>> >>>>>>> >> >> >>>>>>>
>> >>>>>>> >> >> >>>>>>>> Richard,
>> >>>>>>> >> >> >>>>>>>>
>> >>>>>>> >> >> >>>>>>>> In my previous patch I forgot to remove couple lines related to aux
>> >>>>>>> >> >> >>>>>>field.
>> >>>>>>> >> >> >>>>>>>> Here is the correct updated patch.
>> >>>>>>> >> >> >>>>>>>
>> >>>>>>> >> >> >>>>>>> Yeah, I noticed.  This patch would be ok for trunk (together with
>> >>>>>>> >> >> >>>>>>> necessary parts from 1 and 2) if all not required parts are removed
>> >>>>>>> >> >> >>>>>>> (and you'd add the testcases covering non-masked tail vect).
>> >>>>>>> >> >> >>>>>>>
>> >>>>>>> >> >> >>>>>>> Thus, can you please produce a single complete patch containing only
>> >>>>>>> >> >> >>>>>>> non-masked epilogue vectoriziation?
>> >>>>>>> >> >> >>>>>>>
>> >>>>>>> >> >> >>>>>>> Thanks,
>> >>>>>>> >> >> >>>>>>> Richard.
>> >>>>>>> >> >> >>>>>>>
>> >>>>>>> >> >> >>>>>>>> Thanks.
>> >>>>>>> >> >> >>>>>>>> Yuri.
>> >>>>>>> >> >> >>>>>>>>
>> >>>>>>> >> >> >>>>>>>> 2016-11-14 15:51 GMT+03:00 Richard Biener <rguenther@suse.de>:
>> >>>>>>> >> >> >>>>>>>> > On Fri, 11 Nov 2016, Yuri Rumyantsev wrote:
>> >>>>>>> >> >> >>>>>>>> >
>> >>>>>>> >> >> >>>>>>>> >> Richard,
>> >>>>>>> >> >> >>>>>>>> >>
>> >>>>>>> >> >> >>>>>>>> >> I prepare updated 3 patch with passing additional argument to
>> >>>>>>> >> >> >>>>>>>> >> vect_analyze_loop as you proposed (untested).
>> >>>>>>> >> >> >>>>>>>> >>
>> >>>>>>> >> >> >>>>>>>> >> You wrote:
>> >>>>>>> >> >> >>>>>>>> >> tw, I wonder if you can produce a single patch containing just
>> >>>>>>> >> >> >>>>>>>> >> epilogue vectorization, that is combine patches 1-3 but rip out
>> >>>>>>> >> >> >>>>>>>> >> changes only needed by later patches?
>> >>>>>>> >> >> >>>>>>>> >>
>> >>>>>>> >> >> >>>>>>>> >> Did you mean that I exclude all support for vectorization
>> >>>>>>> >> >> >>>>>>epilogues,
>> >>>>>>> >> >> >>>>>>>> >> i.e. exclude from 2-nd patch all non-related changes
>> >>>>>>> >> >> >>>>>>>> >> like
>> >>>>>>> >> >> >>>>>>>> >>
>> >>>>>>> >> >> >>>>>>>> >> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
>> >>>>>>> >> >> >>>>>>>> >> index 11863af..32011c1 100644
>> >>>>>>> >> >> >>>>>>>> >> --- a/gcc/tree-vect-loop.c
>> >>>>>>> >> >> >>>>>>>> >> +++ b/gcc/tree-vect-loop.c
>> >>>>>>> >> >> >>>>>>>> >> @@ -1120,6 +1120,12 @@ new_loop_vec_info (struct loop *loop)
>> >>>>>>> >> >> >>>>>>>> >>    LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
>> >>>>>>> >> >> >>>>>>>> >>    LOOP_VINFO_PEELING_FOR_NITER (res) = false;
>> >>>>>>> >> >> >>>>>>>> >>    LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
>> >>>>>>> >> >> >>>>>>>> >> +  LOOP_VINFO_CAN_BE_MASKED (res) = false;
>> >>>>>>> >> >> >>>>>>>> >> +  LOOP_VINFO_REQUIRED_MASKS (res) = 0;
>> >>>>>>> >> >> >>>>>>>> >> +  LOOP_VINFO_COMBINE_EPILOGUE (res) = false;
>> >>>>>>> >> >> >>>>>>>> >> +  LOOP_VINFO_MASK_EPILOGUE (res) = false;
>> >>>>>>> >> >> >>>>>>>> >> +  LOOP_VINFO_NEED_MASKING (res) = false;
>> >>>>>>> >> >> >>>>>>>> >> +  LOOP_VINFO_ORIG_LOOP_INFO (res) = NULL;
>> >>>>>>> >> >> >>>>>>>> >
>> >>>>>>> >> >> >>>>>>>> > Yes.
>> >>>>>>> >> >> >>>>>>>> >
>> >>>>>>> >> >> >>>>>>>> >> Did you mean also that new combined patch must be working patch,
>> >>>>>>> >> >> >>>>>>i.e.
>> >>>>>>> >> >> >>>>>>>> >> can be integrated without other patches?
>> >>>>>>> >> >> >>>>>>>> >
>> >>>>>>> >> >> >>>>>>>> > Yes.
>> >>>>>>> >> >> >>>>>>>> >
>> >>>>>>> >> >> >>>>>>>> >> Could you please look at updated patch?
>> >>>>>>> >> >> >>>>>>>> >
>> >>>>>>> >> >> >>>>>>>> > Will do.
>> >>>>>>> >> >> >>>>>>>> >
>> >>>>>>> >> >> >>>>>>>> > Thanks,
>> >>>>>>> >> >> >>>>>>>> > Richard.
>> >>>>>>> >> >> >>>>>>>> >
>> >>>>>>> >> >> >>>>>>>> >> Thanks.
>> >>>>>>> >> >> >>>>>>>> >> Yuri.
>> >>>>>>> >> >> >>>>>>>> >>
>> >>>>>>> >> >> >>>>>>>> >> 2016-11-10 15:36 GMT+03:00 Richard Biener <rguenther@suse.de>:
>> >>>>>>> >> >> >>>>>>>> >> > On Thu, 10 Nov 2016, Richard Biener wrote:
>> >>>>>>> >> >> >>>>>>>> >> >
>> >>>>>>> >> >> >>>>>>>> >> >> On Tue, 8 Nov 2016, Yuri Rumyantsev wrote:
>> >>>>>>> >> >> >>>>>>>> >> >>
>> >>>>>>> >> >> >>>>>>>> >> >> > Richard,
>> >>>>>>> >> >> >>>>>>>> >> >> >
>> >>>>>>> >> >> >>>>>>>> >> >> > Here is updated 3 patch.
>> >>>>>>> >> >> >>>>>>>> >> >> >
>> >>>>>>> >> >> >>>>>>>> >> >> > I checked that all new tests related to epilogue
>> >>>>>>> >> >> >>>>>>vectorization passed with it.
>> >>>>>>> >> >> >>>>>>>> >> >> >
>> >>>>>>> >> >> >>>>>>>> >> >> > Your comments will be appreciated.
>> >>>>>>> >> >> >>>>>>>> >> >>
>> >>>>>>> >> >> >>>>>>>> >> >> A lot better now.  Instead of the ->aux dance I now prefer to
>> >>>>>>> >> >> >>>>>>>> >> >> pass the original loops loop_vinfo to vect_analyze_loop as
>> >>>>>>> >> >> >>>>>>>> >> >> optional argument (if non-NULL we analyze the epilogue of that
>> >>>>>>> >> >> >>>>>>>> >> >> loop_vinfo).  OTOH I remember we mainly use it to get at the
>> >>>>>>> >> >> >>>>>>>> >> >> original vectorization factor?  So we can pass down an
>> >>>>>>> >> >> >>>>>>(optional)
>> >>>>>>> >> >> >>>>>>>> >> >> forced vectorization factor as well?
>> >>>>>>> >> >> >>>>>>>> >> >
>> >>>>>>> >> >> >>>>>>>> >> > Btw, I wonder if you can produce a single patch containing just
>> >>>>>>> >> >> >>>>>>>> >> > epilogue vectorization, that is combine patches 1-3 but rip out
>> >>>>>>> >> >> >>>>>>>> >> > changes only needed by later patches?
>> >>>>>>> >> >> >>>>>>>> >> >
>> >>>>>>> >> >> >>>>>>>> >> > Thanks,
>> >>>>>>> >> >> >>>>>>>> >> > Richard.
>> >>>>>>> >> >> >>>>>>>> >> >
>> >>>>>>> >> >> >>>>>>>> >> >> Richard.
>> >>>>>>> >> >> >>>>>>>> >> >>
>> >>>>>>> >> >> >>>>>>>> >> >> > 2016-11-08 15:38 GMT+03:00 Richard Biener
>> >>>>>>> >> >> >>>>>><rguenther@suse.de>:
>> >>>>>>> >> >> >>>>>>>> >> >> > > On Thu, 3 Nov 2016, Yuri Rumyantsev wrote:
>> >>>>>>> >> >> >>>>>>>> >> >> > >
>> >>>>>>> >> >> >>>>>>>> >> >> > >> Hi Richard,
>> >>>>>>> >> >> >>>>>>>> >> >> > >>
>> >>>>>>> >> >> >>>>>>>> >> >> > >> I did not understand your last remark:
>> >>>>>>> >> >> >>>>>>>> >> >> > >>
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > That is, here (and avoid the FOR_EACH_LOOP change):
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > @@ -580,12 +586,21 @@ vectorize_loops (void)
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >           && dump_enabled_p ())
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >           dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
>> >>>>>>> >> >> >>>>>>vect_location,
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >                            "loop vectorized\n");
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > -       vect_transform_loop (loop_vinfo);
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +       new_loop = vect_transform_loop (loop_vinfo);
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >         num_vectorized_loops++;
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >        /* Now that the loop has been vectorized, allow
>> >>>>>>> >> >> >>>>>>it to be unrolled
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >           etc.  */
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >      loop->force_vectorize = false;
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +       /* Add new loop to a processing queue.  To make
>> >>>>>>> >> >> >>>>>>it easier
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +          to match loop and its epilogue vectorization
>> >>>>>>> >> >> >>>>>>in dumps
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +          put new loop as the next loop to process.
>> >>>>>>> >> >> >>>>>>*/
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +       if (new_loop)
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +         {
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +           loops.safe_insert (i + 1, new_loop->num);
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +           vect_loops_num = number_of_loops (cfun);
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +         }
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > simply dispatch to a vectorize_epilogue (loop_vinfo,
>> >>>>>>> >> >> >>>>>>new_loop)
>> >>>>>>> >> >> >>>>>>>> >> >> > >> f> unction which will set up stuff properly (and also
>> >>>>>>> >> >> >>>>>>perform
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > the if-conversion of the epilogue there).
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > That said, if we can get in non-masked epilogue
>> >>>>>>> >> >> >>>>>>vectorization
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > separately that would be great.
>> >>>>>>> >> >> >>>>>>>> >> >> > >>
>> >>>>>>> >> >> >>>>>>>> >> >> > >> Could you please clarify your proposal.
>> >>>>>>> >> >> >>>>>>>> >> >> > >
>> >>>>>>> >> >> >>>>>>>> >> >> > > When a loop was vectorized set things up to immediately
>> >>>>>>> >> >> >>>>>>vectorize
>> >>>>>>> >> >> >>>>>>>> >> >> > > its epilogue, avoiding changing the loop iteration and
>> >>>>>>> >> >> >>>>>>avoiding
>> >>>>>>> >> >> >>>>>>>> >> >> > > the re-use of ->aux.
>> >>>>>>> >> >> >>>>>>>> >> >> > >
>> >>>>>>> >> >> >>>>>>>> >> >> > > Richard.
>> >>>>>>> >> >> >>>>>>>> >> >> > >
>> >>>>>>> >> >> >>>>>>>> >> >> > >> Thanks.
>> >>>>>>> >> >> >>>>>>>> >> >> > >> Yuri.
>> >>>>>>> >> >> >>>>>>>> >> >> > >>
>> >>>>>>> >> >> >>>>>>>> >> >> > >> 2016-11-02 15:27 GMT+03:00 Richard Biener
>> >>>>>>> >> >> >>>>>><rguenther@suse.de>:
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > On Tue, 1 Nov 2016, Yuri Rumyantsev wrote:
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >> Hi All,
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >>
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >> I re-send all patches sent by Ilya earlier for review
>> >>>>>>> >> >> >>>>>>which support
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >> vectorization of loop epilogues and loops with low
>> >>>>>>> >> >> >>>>>>trip count. We
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >> assume that the only patch -
>> >>>>>>> >> >> >>>>>>vec-tails-07-combine-tail.patch - was not
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >> approved by Jeff.
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >>
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >> I did re-base of all patches and performed
>> >>>>>>> >> >> >>>>>>bootstrapping and
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >> regression testing that did not show any new failures.
>> >>>>>>> >> >> >>>>>>Also all
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >> changes related to new vect_do_peeling algorithm have
>> >>>>>>> >> >> >>>>>>been changed
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >> accordingly.
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >>
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >> Is it OK for trunk?
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > I would have prefered that the series up to
>> >>>>>>> >> >> >>>>>>-03-nomask-tails would
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > _only_ contain epilogue loop vectorization changes but
>> >>>>>>> >> >> >>>>>>unfortunately
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > the patchset is oddly separated.
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > I have a comment on that part nevertheless:
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > @@ -1608,7 +1614,10 @@ vect_enhance_data_refs_alignment
>> >>>>>>> >> >> >>>>>>(loop_vec_info
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > loop_vinfo)
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >    /* Check if we can possibly peel the loop.  */
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >    if (!vect_can_advance_ivs_p (loop_vinfo)
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >        || !slpeel_can_duplicate_loop_p (loop,
>> >>>>>>> >> >> >>>>>>single_exit (loop))
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > -      || loop->inner)
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +      || loop->inner
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +      /* Required peeling was performed in prologue
>> >>>>>>> >> >> >>>>>>and
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +        is not required for epilogue.  */
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +      || LOOP_VINFO_EPILOGUE_P (loop_vinfo))
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >      do_peeling = false;
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >    if (do_peeling
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > @@ -1888,7 +1897,10 @@ vect_enhance_data_refs_alignment
>> >>>>>>> >> >> >>>>>>(loop_vec_info
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > loop_vinfo)
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >    do_versioning =
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >         optimize_loop_nest_for_speed_p (loop)
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > -       && (!loop->inner); /* FORNOW */
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +       && (!loop->inner) /* FORNOW */
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +        /* Required versioning was performed for the
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +          original loop and is not required for
>> >>>>>>> >> >> >>>>>>epilogue.  */
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +       && !LOOP_VINFO_EPILOGUE_P (loop_vinfo);
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >    if (do_versioning)
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >      {
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > please do that check in the single caller of this
>> >>>>>>> >> >> >>>>>>function.
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > Otherwise I still dislike the new ->aux use and I
>> >>>>>>> >> >> >>>>>>believe that simply
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > passing down info from the processed parent would be
>> >>>>>>> >> >> >>>>>>_much_ cleaner.
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > That is, here (and avoid the FOR_EACH_LOOP change):
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > @@ -580,12 +586,21 @@ vectorize_loops (void)
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >             && dump_enabled_p ())
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >            dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
>> >>>>>>> >> >> >>>>>>vect_location,
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >                             "loop vectorized\n");
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > -       vect_transform_loop (loop_vinfo);
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +       new_loop = vect_transform_loop (loop_vinfo);
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >         num_vectorized_loops++;
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >         /* Now that the loop has been vectorized, allow
>> >>>>>>> >> >> >>>>>>it to be unrolled
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >            etc.  */
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >         loop->force_vectorize = false;
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +       /* Add new loop to a processing queue.  To make
>> >>>>>>> >> >> >>>>>>it easier
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +          to match loop and its epilogue vectorization
>> >>>>>>> >> >> >>>>>>in dumps
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +          put new loop as the next loop to process.
>> >>>>>>> >> >> >>>>>>*/
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +       if (new_loop)
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +         {
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +           loops.safe_insert (i + 1, new_loop->num);
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +           vect_loops_num = number_of_loops (cfun);
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +         }
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > simply dispatch to a vectorize_epilogue (loop_vinfo,
>> >>>>>>> >> >> >>>>>>new_loop)
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > function which will set up stuff properly (and also
>> >>>>>>> >> >> >>>>>>perform
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > the if-conversion of the epilogue there).
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > That said, if we can get in non-masked epilogue
>> >>>>>>> >> >> >>>>>>vectorization
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > separately that would be great.
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > I'm still torn about all the rest of the stuff and
>> >>>>>>> >> >> >>>>>>question its
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > usability (esp. merging the epilogue with the main
>> >>>>>>> >> >> >>>>>>vector loop).
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > But it has already been approved ... oh well.
>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > Thanks,
>> >>>>>>> >> >> >>>>>>>> >> >> > >> > Richard.
>> >>>>>>> >> >> >>>>>>>> >> >> > >>
>> >>>>>>> >> >> >>>>>>>> >> >> > >>
>> >>>>>>> >> >> >>>>>>>> >> >> > >
>> >>>>>>> >> >> >>>>>>>> >> >> > > --
>> >>>>>>> >> >> >>>>>>>> >> >> > > Richard Biener <rguenther@suse.de>
>> >>>>>>> >> >> >>>>>>>> >> >> > > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard,
>> >>>>>>> >> >> >>>>>>Graham Norton, HRB 21284 (AG Nuernberg)
>> >>>>>>> >> >> >>>>>>>> >> >> >
>> >>>>>>> >> >> >>>>>>>> >> >>
>> >>>>>>> >> >> >>>>>>>> >> >>
>> >>>>>>> >> >> >>>>>>>> >> >
>> >>>>>>> >> >> >>>>>>>> >> > --
>> >>>>>>> >> >> >>>>>>>> >> > Richard Biener <rguenther@suse.de>
>> >>>>>>> >> >> >>>>>>>> >> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham
>> >>>>>>> >> >> >>>>>>Norton, HRB 21284 (AG Nuernberg)
>> >>>>>>> >> >> >>>>>>>> >>
>> >>>>>>> >> >> >>>>>>>> >
>> >>>>>>> >> >> >>>>>>>> > --
>> >>>>>>> >> >> >>>>>>>> > Richard Biener <rguenther@suse.de>
>> >>>>>>> >> >> >>>>>>>> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham
>> >>>>>>> >> >> >>>>>>Norton, HRB 21284 (AG Nuernberg)
>> >>>>>>> >> >> >>>>>>>>
>> >>>>>>> >> >> >>>>>>>
>> >>>>>>> >> >> >>>>>>> --
>> >>>>>>> >> >> >>>>>>> Richard Biener <rguenther@suse.de>
>> >>>>>>> >> >> >>>>>>> SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham
>> >>>>>>> >> >> >>>>>>Norton, HRB 21284 (AG Nuernberg)
>> >>>>>>> >> >> >>>>>
>> >>>>>>> >> >> >>>>>
>> >>>>>>> >> >>
>> >>>>>>> >> >
>> >>>>>>> >> > --
>> >>>>>>> >> > Richard Biener <rguenther@suse.de>
>> >>>>>>> >> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)
>> >>>>>>> >>
>> >>>>>>> >
>> >>>>>>> > --
>> >>>>>>> > Richard Biener <rguenther@suse.de>
>> >>>>>>> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)
>> >>>>>>>
>> >>>>>>>
>> >>>>>>
>> >>>>>> --
>> >>>>>> Richard Biener <rguenther@suse.de>
>> >>>>>> SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)
>>
>
> --
> Richard Biener <rguenther@suse.de>
> SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)
Yuri Rumyantsev Dec. 21, 2016, 4:32 p.m. UTC | #4
Sorry,

I put wrong test - fix it here.

2016-12-21 13:12 GMT+03:00 Yuri Rumyantsev <ysrumyan@gmail.com>:
> Hi Richard,
>
> I occasionally found out a bug in my patch related to epilogue
> vectorization without masking : need to put label before
> initialization.
>
> Could you please review and integrate it to trunk. Test-case is also attached.
>
>
> Thanks ahead.
> Yuri.
>
> ChangeLog:
> 2016-12-21  Yuri Rumyantsev  <ysrumyan@gmail.com>
>
> * tree-vectorizer.c (vectorize_loops): Put label before initialization
> of loop_vectorized_call.
>
> gcc/testsuite/
>
> * gcc.dg/vect/vect-tail-nomask-2.c: New test.
>
> 2016-12-13 16:59 GMT+03:00 Richard Biener <rguenther@suse.de>:
>> On Mon, 12 Dec 2016, Yuri Rumyantsev wrote:
>>
>>> Richard,
>>>
>>> Could you please review cost model patch before to include it to
>>> epilogue masking patch and add masking cost estimation as you
>>> requested.
>>
>> That's just the middle-end / target changes.  I was not 100% happy
>> with them but well, the vectorizer cost modeling needs work
>> (aka another rewrite).
>>
>> From below...
>>
>>> Thanks.
>>>
>>> Patch and ChangeLog are attached.
>>>
>>> 2016-12-12 15:47 GMT+03:00 Yuri Rumyantsev <ysrumyan@gmail.com>:
>>> > Hi Richard,
>>> >
>>> > You asked me about performance of spec2006 on AVX2 machine with new feature.
>>> >
>>> > I tried the following on Haswell using original patch designed by Ilya.
>>> > 1. Masking low trip count loops  only 6 benchmarks are affected and
>>> > performance is almost the same
>>> > 464.h264ref     63.9000    64.0000 +0.15%
>>> > 416.gamess      42.9000    42.9000 +0%
>>> > 435.gromacs     32.8000    32.7000 -0.30%
>>> > 447.dealII      68.5000    68.3000 -0.29%
>>> > 453.povray      61.9000    62.1000 +0.32%
>>> > 454.calculix    39.8000    39.8000 +0%
>>> > 465.tonto       29.9000    29.9000 +0%
>>> >
>>> > 2. epilogue vectorization without masking (use less vf) (3 benchmarks
>>> > are not affected)
>>> > 400.perlbench     47.2000    46.5000 -1.48%
>>> > 401.bzip2         29.9000    29.9000 +0%
>>> > 403.gcc           41.8000    41.6000 -0.47%
>>> > 456.hmmer         32.0000    32.0000 +0%
>>> > 462.libquantum    81.5000    82.0000 +0.61%
>>> > 464.h264ref       65.0000    65.5000 +0.76%
>>> > 471.omnetpp       27.8000    28.2000 +1.43%
>>> > 473.astar         28.7000    28.6000 -0.34%
>>> > 483.xalancbmk     48.7000    48.6000 -0.20%
>>> > 410.bwaves        95.3000    95.3000 +0%
>>> > 416.gamess        42.9000    42.8000 -0.23%
>>> > 433.milc          38.8000    38.8000 +0%
>>> > 434.zeusmp        51.7000    51.4000 -0.58%
>>> > 435.gromacs       32.8000    32.8000 +0%
>>> > 436.cactusADM     85.0000    83.0000 -2.35%
>>> > 437.leslie3d      55.5000    55.5000 +0%
>>> > 444.namd          31.3000    31.3000 +0%
>>> > 447.dealII        68.7000    68.9000 +0.29%
>>> > 450.soplex        47.3000    47.4000 +0.21%
>>> > 453.povray        62.1000    61.4000 -1.12%
>>> > 454.calculix      39.7000    39.3000 -1.00%
>>> > 459.GemsFDTD      44.9000    45.0000 +0.22%
>>> > 465.tonto         29.8000    29.8000 +0%
>>> > 481.wrf           51.0000    51.2000 +0.39%
>>> > 482.sphinx3       69.8000    71.2000 +2.00%
>>
>> I see 471.omnetpp and 482.sphinx3 are in a similar ballpark and it
>> would be nice to catch the relevant case(s) with a cost model for
>> epilogue vectorization without masking first (to get rid of
>> --param vect-epilogues-nomask).
>>
>> As said elsewhere any non-conservative cost modeling (if the
>> number of scalar iterations is not statically constant) might
>> require versioning of the loop into a non-vectorized,
>> short-trip vectorized and regular vectorized case (the Intel
>> compiler does way more aggressive versioning IIRC).
>>
>> Richard.
>>
>>> > 3. epilogue vectorization using masking (4 benchmarks are not affected):
>>> > 400.perlbench     47.5000    46.8000 -1.47%
>>> > 401.bzip2         30.0000    29.9000 -0.33%
>>> > 403.gcc           42.3000    42.3000 +0%
>>> > 445.gobmk         32.1000    32.8000 +2.18%
>>> > 456.hmmer         32.0000    32.0000 +0%
>>> > 458.sjeng         36.1000    35.5000 -1.66%
>>> > 462.libquantum    81.1000    81.1000 +0%
>>> > 464.h264ref       65.4000    65.0000 -0.61%
>>> > 483.xalancbmk     49.4000    49.3000 -0.20%
>>> > 410.bwaves        95.9000    95.5000 -0.41%
>>> > 416.gamess        42.8000    42.6000 -0.46%
>>> > 433.milc          38.8000    39.1000 +0.77%
>>> > 434.zeusmp        52.1000    51.3000 -1.53%
>>> > 435.gromacs       32.9000    32.9000 +0%
>>> > 436.cactusADM     78.8000    85.3000 +8.24%
>>> > 437.leslie3d      55.4000    55.4000 +0%
>>> > 444.namd          31.3000    31.3000 +0%
>>> > 447.dealII        69.0000    69.2000 +0.28%
>>> > 450.soplex        47.7000    47.6000 -0.20%
>>> > 453.povray        62.2000    61.7000 -0.80%
>>> > 454.calculix      39.7000    38.2000 -3.77%
>>> > 459.GemsFDTD      44.9000    45.0000 +0.22%
>>> > 465.tonto         29.8000    29.9000 +0.33%
>>> > 481.wrf           51.2000    51.6000 +0.78%
>>> > 482.sphinx3       70.3000    65.4000 -6.97%
>>> >
>>> > There is a good speed-up for 436 but there is essential slow0down on 482, 454.
>>> >
>>> > So In general we don't have any advantages for AVX2.
>>> >
>>> > Best regards.
>>> > Yuri.
>>> >
>>> > P.S.
>>> > I  am not able to provide you with avx512 numbers because i don't have
>>> > an access to it.
>>> > Updated patch will be sent later.
>>> >
>>> > Best regards.
>>> > Yuri.
>>> >
>>> >
>>> > 2016-12-05 15:44 GMT+03:00 Yuri Rumyantsev <ysrumyan@gmail.com>:
>>> >> Richard,
>>> >>
>>> >> Sorry, U sent you the bad assembly produced for loop with low trip
>>> >> count, here is the correct one:
>>> >>
>>> >> vmovdqa .LC0(%rip), %ymm0
>>> >> vpmaskmovd b(%rip), %ymm0, %ymm1
>>> >> vpmaskmovd c(%rip), %ymm0, %ymm2
>>> >> vpaddd %ymm2, %ymm1, %ymm1
>>> >> vpmaskmovd %ymm1, %ymm0, a(%rip)
>>> >>
>>> >> where .LC0 vector with all elements equal to -1 except for the last.
>>> >>
>>> >> Note also that additional option is required --param
>>> >> vect-short-loops=1 to do such conversion.
>>> >>
>>> >> Best regards.
>>> >> Yuri.
>>> >>
>>> >> 2016-12-02 18:59 GMT+03:00 Yuri Rumyantsev <ysrumyan@gmail.com>:
>>> >>> Richard,
>>> >>>
>>> >>> Important clarification: the test I sent you with low trip count is
>>> >>> vectorized through masking only under
>>> >>> --param vect-epilogues-combine=1 -fvect-epilogue-cost-model=unlimited
>>> >>> for avx2. The laast option isrequired for avx2 since masked store has
>>> >>> big cost in comparison with masked load.
>>> >>>
>>> >>> Below is assemby produced for it:
>>> >>> vpcmpeqd %xmm0, %xmm0, %xmm0
>>> >>> vpmaskmovd b(%rip), %xmm0, %xmm1
>>> >>> vpmaskmovd c(%rip), %xmm0, %xmm2
>>> >>> vpaddd %xmm2, %xmm1, %xmm1
>>> >>> vpmaskmovd %xmm1, %xmm0, a(%rip)
>>> >>> ret
>>> >>>
>>> >>> Thanks.
>>> >>> Yuri.
>>> >>>
>>> >>>
>>> >>> 2016-12-02 18:49 GMT+03:00 Yuri Rumyantsev <ysrumyan@gmail.com>:
>>> >>>> Richard,
>>> >>>>
>>> >>>> I have also question about low trip count loops.
>>> >>>> Did you mean that
>>> >>>> int a[128], b[128], c[128];
>>> >>>>
>>> >>>> void foo ()
>>> >>>> {
>>> >>>>   int i;
>>> >>>>   for (i = 0; i<7; i++)
>>> >>>>     a[i] = b[i] + c[i];
>>> >>>> }
>>> >>>>
>>> >>>> must be vectorizzed with masking without epilogue creation (e.g. for avx2)?
>>> >>>>
>>> >>>> Currently it vectorized with vector size 128. I also noticed that
>>> >>>> original Ilya patch does nothing for such masking.
>>> >>>>
>>> >>>> Thanks.
>>> >>>> Yuri.
>>> >>>>
>>> >>>> 2016-12-02 17:08 GMT+03:00 Yuri Rumyantsev <ysrumyan@gmail.com>:
>>> >>>>> Richard,
>>> >>>>>
>>> >>>>> You wrote:
>>> >>>>> I don't see _any_ cost model for vectorizing the epilogue with
>>> >>>>> masking?  Am I missing something?  A "trivial" cost model
>>> >>>>> should at least consider the additional IV(s), the mask
>>> >>>>> compute and the widening and narrowing ops required.
>>> >>>>>
>>> >>>>> I skipped all changes related to cost model assuming that one of the
>>> >>>>> next patch will contain all cost model changes.
>>> >>>>>
>>> >>>>> Should I include it to this patch?
>>> >>>>>
>>> >>>>> Thanks.
>>> >>>>> Yuri.
>>> >>>>>
>>> >>>>> 2016-12-01 17:45 GMT+03:00 Richard Biener <rguenther@suse.de>:
>>> >>>>>> On Thu, 1 Dec 2016, Yuri Rumyantsev wrote:
>>> >>>>>>
>>> >>>>>>> Thanks Richard for your comments.
>>> >>>>>>>
>>> >>>>>>> You asked me about possible performance improvements for AVX2 machines
>>> >>>>>>> - we did not see any visible speed-up for spec2k with any method of
>>> >>>>>>
>>> >>>>>> Spec 2000?  Can you check with SPEC 2006 or CPUv6?
>>> >>>>>>
>>> >>>>>> Did you see performance degradation?  What about compile-time and
>>> >>>>>> binary size effects?
>>> >>>>>>
>>> >>>>>>> masking, including epilogue masking and combining, only on AVX512
>>> >>>>>>> machine aka knl.
>>> >>>>>>
>>> >>>>>> I see.
>>> >>>>>>
>>> >>>>>> Note that as said in the initial review patch the cost model I
>>> >>>>>> saw therein looked flawed.  In the end I'd expect a sensible
>>> >>>>>> approach would be to do
>>> >>>>>>
>>> >>>>>>  if (n < scalar-most-profitable-niter)
>>> >>>>>>    {
>>> >>>>>>      no vectorization
>>> >>>>>>    }
>>> >>>>>>  else if (n < masking-more-profitable-than-not-masking-plus-epilogue)
>>> >>>>>>    {
>>> >>>>>>      do masked vectorization
>>> >>>>>>    }
>>> >>>>>>  else
>>> >>>>>>    {
>>> >>>>>>      do unmasked vectorization (with epilogue, eventually vectorized)
>>> >>>>>>    }
>>> >>>>>>
>>> >>>>>> where for short trip loops the else path would never be taken
>>> >>>>>> (statically).
>>> >>>>>>
>>> >>>>>> And yes, that means masking will only be useful for short-trip loops
>>> >>>>>> which in the end means an overall performance benfit is unlikely
>>> >>>>>> unless we have a lot of short-trip loops that are slow because of
>>> >>>>>> the overhead of main unmasked loop plus epilogue.
>>> >>>>>>
>>> >>>>>> Richard.
>>> >>>>>>
>>> >>>>>>> I will answer on your question later.
>>> >>>>>>>
>>> >>>>>>> Best regards.
>>> >>>>>>> Yuri
>>> >>>>>>>
>>> >>>>>>> 2016-12-01 14:33 GMT+03:00 Richard Biener <rguenther@suse.de>:
>>> >>>>>>> > On Mon, 28 Nov 2016, Yuri Rumyantsev wrote:
>>> >>>>>>> >
>>> >>>>>>> >> Richard!
>>> >>>>>>> >>
>>> >>>>>>> >> I attached vect dump for hte part of attached test-case which
>>> >>>>>>> >> illustrated how vectorization of epilogues works through masking:
>>> >>>>>>> >> #define SIZE 1023
>>> >>>>>>> >> #define ALIGN 64
>>> >>>>>>> >>
>>> >>>>>>> >> extern int posix_memalign(void **memptr, __SIZE_TYPE__ alignment,
>>> >>>>>>> >> __SIZE_TYPE__ size) __attribute__((weak));
>>> >>>>>>> >> extern void free (void *);
>>> >>>>>>> >>
>>> >>>>>>> >> void __attribute__((noinline))
>>> >>>>>>> >> test_citer (int * __restrict__ a,
>>> >>>>>>> >>    int * __restrict__ b,
>>> >>>>>>> >>    int * __restrict__ c)
>>> >>>>>>> >> {
>>> >>>>>>> >>   int i;
>>> >>>>>>> >>
>>> >>>>>>> >>   a = (int *)__builtin_assume_aligned (a, ALIGN);
>>> >>>>>>> >>   b = (int *)__builtin_assume_aligned (b, ALIGN);
>>> >>>>>>> >>   c = (int *)__builtin_assume_aligned (c, ALIGN);
>>> >>>>>>> >>
>>> >>>>>>> >>   for (i = 0; i < SIZE; i++)
>>> >>>>>>> >>     c[i] = a[i] + b[i];
>>> >>>>>>> >> }
>>> >>>>>>> >>
>>> >>>>>>> >> It was compiled with -mavx2 --param vect-epilogues-mask=1 options.
>>> >>>>>>> >>
>>> >>>>>>> >> I did not include in this patch vectorization of low trip-count loops
>>> >>>>>>> >> since in the original patch additional parameter was introduced:
>>> >>>>>>> >> +DEFPARAM (PARAM_VECT_SHORT_LOOPS,
>>> >>>>>>> >> +  "vect-short-loops",
>>> >>>>>>> >> +  "Enable vectorization of low trip count loops using masking.",
>>> >>>>>>> >> +  0, 0, 1)
>>> >>>>>>> >>
>>> >>>>>>> >> I assume that this ability can be included very quickly but it
>>> >>>>>>> >> requires cost model enhancements also.
>>> >>>>>>> >
>>> >>>>>>> > Comments on the patch itself (as I'm having a closer look again,
>>> >>>>>>> > I know how it vectorizes the above but I wondered why epilogue
>>> >>>>>>> > and short-trip loops are not basically the same code path).
>>> >>>>>>> >
>>> >>>>>>> > Btw, I don't like that the features are behind a --param paywall.
>>> >>>>>>> > That just means a) nobody will use it, b) it will bit-rot quickly,
>>> >>>>>>> > c) bugs are well-hidden.
>>> >>>>>>> >
>>> >>>>>>> > +  if (loop_vinfo && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
>>> >>>>>>> > +      && integer_zerop (nested_in_vect_loop
>>> >>>>>>> > +                       ? STMT_VINFO_DR_STEP (stmt_info)
>>> >>>>>>> > +                       : DR_STEP (dr)))
>>> >>>>>>> > +    {
>>> >>>>>>> > +      if (dump_enabled_p ())
>>> >>>>>>> > +       dump_printf_loc (MSG_NOTE, vect_location,
>>> >>>>>>> > +                        "allow invariant load for masked loop.\n");
>>> >>>>>>> > +    }
>>> >>>>>>> >
>>> >>>>>>> > this can test memory_access_type == VMAT_INVARIANT.  Please put
>>> >>>>>>> > all the checks in a common
>>> >>>>>>> >
>>> >>>>>>> >   if (loop_vinfo && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
>>> >>>>>>> >     {
>>> >>>>>>> >        if (memory_access_type == VMAT_INVARIANT)
>>> >>>>>>> >          {
>>> >>>>>>> >          }
>>> >>>>>>> >        else if (...)
>>> >>>>>>> >          {
>>> >>>>>>> >             LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = false;
>>> >>>>>>> >          }
>>> >>>>>>> >        else if (..)
>>> >>>>>>> > ...
>>> >>>>>>> >     }
>>> >>>>>>> >
>>> >>>>>>> > @@ -6667,6 +6756,15 @@ vectorizable_load (gimple *stmt,
>>> >>>>>>> > gimple_stmt_iterator *gsi, gimple **vec_stmt,
>>> >>>>>>> >        gcc_assert (!nested_in_vect_loop);
>>> >>>>>>> >        gcc_assert (!STMT_VINFO_GATHER_SCATTER_P (stmt_info));
>>> >>>>>>> >
>>> >>>>>>> > +      if (loop_vinfo && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
>>> >>>>>>> > +       {
>>> >>>>>>> > +         if (dump_enabled_p ())
>>> >>>>>>> > +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>>> >>>>>>> > +                            "cannot be masked: grouped access is not"
>>> >>>>>>> > +                            " supported.");
>>> >>>>>>> > +         LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = false;
>>> >>>>>>> > +      }
>>> >>>>>>> > +
>>> >>>>>>> >
>>> >>>>>>> > isn't this already handled by the above?  Or rather the general
>>> >>>>>>> > disallowance of SLP?
>>> >>>>>>> >
>>> >>>>>>> > @@ -5730,6 +5792,24 @@ vectorizable_store (gimple *stmt,
>>> >>>>>>> > gimple_stmt_iterator *gsi, gimple **vec_stmt,
>>> >>>>>>> >                             &memory_access_type, &gs_info))
>>> >>>>>>> >      return false;
>>> >>>>>>> >
>>> >>>>>>> > +  if (loop_vinfo && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
>>> >>>>>>> > +      && memory_access_type != VMAT_CONTIGUOUS)
>>> >>>>>>> > +    {
>>> >>>>>>> > +      LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = false;
>>> >>>>>>> > +      if (dump_enabled_p ())
>>> >>>>>>> > +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>>> >>>>>>> > +                        "cannot be masked: unsupported memory access
>>> >>>>>>> > type.\n");
>>> >>>>>>> > +    }
>>> >>>>>>> > +
>>> >>>>>>> > +  if (loop_vinfo && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
>>> >>>>>>> > +      && !can_mask_load_store (stmt))
>>> >>>>>>> > +    {
>>> >>>>>>> > +      LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = false;
>>> >>>>>>> > +      if (dump_enabled_p ())
>>> >>>>>>> > +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>>> >>>>>>> > +                        "cannot be masked: unsupported masked store.\n");
>>> >>>>>>> > +    }
>>> >>>>>>> > +
>>> >>>>>>> >
>>> >>>>>>> > likewise please combine the ifs.
>>> >>>>>>> >
>>> >>>>>>> > @@ -2354,7 +2401,10 @@ vectorizable_mask_load_store (gimple *stmt,
>>> >>>>>>> > gimple_stmt_iterator *gsi,
>>> >>>>>>> >                                           ptr, vec_mask, vec_rhs);
>>> >>>>>>> >           vect_finish_stmt_generation (stmt, new_stmt, gsi);
>>> >>>>>>> >           if (i == 0)
>>> >>>>>>> > -           STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
>>> >>>>>>> > +           {
>>> >>>>>>> > +             STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
>>> >>>>>>> > +             STMT_VINFO_FIRST_COPY_P (vinfo_for_stmt (new_stmt)) = true;
>>> >>>>>>> > +           }
>>> >>>>>>> >           else
>>> >>>>>>> >             STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
>>> >>>>>>> >           prev_stmt_info = vinfo_for_stmt (new_stmt);
>>> >>>>>>> >
>>> >>>>>>> > here you only set the flag, elsewhere you copy DR and VECTYPE as well.
>>> >>>>>>> >
>>> >>>>>>> > @@ -2113,6 +2146,20 @@ vectorizable_mask_load_store (gimple *stmt,
>>> >>>>>>> > gimple_stmt_iterator *gsi,
>>> >>>>>>> >                && !useless_type_conversion_p (vectype, rhs_vectype)))
>>> >>>>>>> >      return false;
>>> >>>>>>> >
>>> >>>>>>> > +  if (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
>>> >>>>>>> > +    {
>>> >>>>>>> > +      /* Check that mask conjuction is supported.  */
>>> >>>>>>> > +      optab tab;
>>> >>>>>>> > +      tab = optab_for_tree_code (BIT_AND_EXPR, vectype, optab_default);
>>> >>>>>>> > +      if (!tab || optab_handler (tab, TYPE_MODE (vectype)) ==
>>> >>>>>>> > CODE_FOR_nothing)
>>> >>>>>>> > +       {
>>> >>>>>>> > +         if (dump_enabled_p ())
>>> >>>>>>> > +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>>> >>>>>>> > +                            "cannot be masked: unsupported mask
>>> >>>>>>> > operation\n");
>>> >>>>>>> > +         LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = false;
>>> >>>>>>> > +       }
>>> >>>>>>> > +    }
>>> >>>>>>> >
>>> >>>>>>> > does this really test whether we can bit-and the mask?  You are
>>> >>>>>>> > using the vector type of the store (which might be V2DF for example),
>>> >>>>>>> > also for AVX512 it might be a vector-bool type with integer mode?
>>> >>>>>>> > Of course we maybe can simply assume mask conjunction is available
>>> >>>>>>> > (I know no ISA where that would be not true).
>>> >>>>>>> >
>>> >>>>>>> > +/* Return true if STMT can be converted to masked form.  */
>>> >>>>>>> > +
>>> >>>>>>> > +static bool
>>> >>>>>>> > +can_mask_load_store (gimple *stmt)
>>> >>>>>>> > +{
>>> >>>>>>> > +  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
>>> >>>>>>> > +  tree vectype, mask_vectype;
>>> >>>>>>> > +  tree lhs, ref;
>>> >>>>>>> > +
>>> >>>>>>> > +  if (!stmt_info)
>>> >>>>>>> > +    return false;
>>> >>>>>>> > +  lhs = gimple_assign_lhs (stmt);
>>> >>>>>>> > +  ref = (TREE_CODE (lhs) == SSA_NAME) ? gimple_assign_rhs1 (stmt) : lhs;
>>> >>>>>>> > +  if (may_be_nonaddressable_p (ref))
>>> >>>>>>> > +    return false;
>>> >>>>>>> > +  vectype = STMT_VINFO_VECTYPE (stmt_info);
>>> >>>>>>> >
>>> >>>>>>> > You probably modeled this after ifcvt_can_use_mask_load_store but I
>>> >>>>>>> > don't think checking may_be_nonaddressable_p is necessary (we couldn't
>>> >>>>>>> > even vectorize such refs).  stmt_info should never be NULL either.
>>> >>>>>>> > With the check removed tree-ssa-loop-ivopts.h should no longer be
>>> >>>>>>> > necessary.
>>> >>>>>>> >
>>> >>>>>>> > +static void
>>> >>>>>>> > +vect_mask_load_store_stmt (gimple *stmt, tree vectype, tree mask,
>>> >>>>>>> > +                          data_reference *dr, gimple_stmt_iterator *si)
>>> >>>>>>> > +{
>>> >>>>>>> > ...
>>> >>>>>>> > +  addr = force_gimple_operand_gsi (&gsi, build_fold_addr_expr (mem),
>>> >>>>>>> > +                                  true, NULL_TREE, true,
>>> >>>>>>> > +                                  GSI_SAME_STMT);
>>> >>>>>>> > +
>>> >>>>>>> > +  align = TYPE_ALIGN_UNIT (vectype);
>>> >>>>>>> > +  if (aligned_access_p (dr))
>>> >>>>>>> > +    misalign = 0;
>>> >>>>>>> > +  else if (DR_MISALIGNMENT (dr) == -1)
>>> >>>>>>> > +    {
>>> >>>>>>> > +      align = TYPE_ALIGN_UNIT (elem_type);
>>> >>>>>>> > +      misalign = 0;
>>> >>>>>>> > +    }
>>> >>>>>>> > +  else
>>> >>>>>>> > +    misalign = DR_MISALIGNMENT (dr);
>>> >>>>>>> > +  set_ptr_info_alignment (get_ptr_info (addr), align, misalign);
>>> >>>>>>> > +  ptr = build_int_cst (reference_alias_ptr_type (mem),
>>> >>>>>>> > +                      misalign ? misalign & -misalign : align);
>>> >>>>>>> >
>>> >>>>>>> > you should simply use
>>> >>>>>>> >
>>> >>>>>>> >   align = get_object_alignment (mem) / BITS_PER_UNIT;
>>> >>>>>>> >
>>> >>>>>>> > here rather than trying to be clever.  Eventually you don't need
>>> >>>>>>> > the DR then (see question above).
>>> >>>>>>> >
>>> >>>>>>> > +    }
>>> >>>>>>> > +  gsi_replace (si ? si : &gsi, new_stmt, false);
>>> >>>>>>> >
>>> >>>>>>> > when you replace the load/store please previously copy VUSE and VDEF
>>> >>>>>>> > from the original one (we were nearly clean enough to no longer
>>> >>>>>>> > require a virtual operand rewrite after vectorization...)  Thus
>>> >>>>>>> >
>>> >>>>>>> >   gimple_set_vuse (new_stmt, gimple_vuse (stmt));
>>> >>>>>>> >   gimple_set_vdef (new_stmt, gimple_vdef (stmt));
>>> >>>>>>> >
>>> >>>>>>> > +static void
>>> >>>>>>> > +vect_mask_loop (loop_vec_info loop_vinfo)
>>> >>>>>>> > +{
>>> >>>>>>> > ...
>>> >>>>>>> > +  /* Scan all loop statements to convert vector load/store including
>>> >>>>>>> > masked
>>> >>>>>>> > +     form.  */
>>> >>>>>>> > +  for (unsigned i = 0; i < loop->num_nodes; i++)
>>> >>>>>>> > +    {
>>> >>>>>>> > +      basic_block bb = bbs[i];
>>> >>>>>>> > +      for (gimple_stmt_iterator si = gsi_start_bb (bb);
>>> >>>>>>> > +          !gsi_end_p (si); gsi_next (&si))
>>> >>>>>>> > +       {
>>> >>>>>>> > +         gimple *stmt = gsi_stmt (si);
>>> >>>>>>> > +         stmt_vec_info stmt_info = NULL;
>>> >>>>>>> > +         tree vectype = NULL;
>>> >>>>>>> > +         data_reference *dr;
>>> >>>>>>> > +
>>> >>>>>>> > +         /* Mask load case.  */
>>> >>>>>>> > +         if (is_gimple_call (stmt)
>>> >>>>>>> > +             && gimple_call_internal_p (stmt)
>>> >>>>>>> > +             && gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
>>> >>>>>>> > +             && !VECTOR_TYPE_P (TREE_TYPE (gimple_call_arg (stmt, 2))))
>>> >>>>>>> > +           {
>>> >>>>>>> > ...
>>> >>>>>>> > +             /* Skip invariant loads.  */
>>> >>>>>>> > +             if (integer_zerop (nested_in_vect_loop_p (loop, stmt)
>>> >>>>>>> > +                                ? STMT_VINFO_DR_STEP (stmt_info)
>>> >>>>>>> > +                                : DR_STEP (STMT_VINFO_DATA_REF
>>> >>>>>>> > (stmt_info))))
>>> >>>>>>> > +               continue;
>>> >>>>>>> >
>>> >>>>>>> > seeing this it would be nice if stmt_info had a flag for whether
>>> >>>>>>> > the stmt needs masking (and a flag on wheter this is a scalar or a
>>> >>>>>>> > vectorized stmt).
>>> >>>>>>> >
>>> >>>>>>> > +         /* Skip hoisted out statements.  */
>>> >>>>>>> > +         if (!flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
>>> >>>>>>> > +           continue;
>>> >>>>>>> >
>>> >>>>>>> > err, you walk stmts in the loop!  Isn't this covered by the above
>>> >>>>>>> > skipping of 'invariant loads'?
>>> >>>>>>> >
>>> >>>>>>> > +static gimple *
>>> >>>>>>> > +vect_mask_reduction_stmt (gimple *stmt, tree mask, gimple *prev)
>>> >>>>>>> > +{
>>> >>>>>>> >
>>> >>>>>>> > depending on the reduction operand there are variants that
>>> >>>>>>> > could get away w/o the VEC_COND_EXPR, like
>>> >>>>>>> >
>>> >>>>>>> >   S1': tem_4 = d_3 & MASK;
>>> >>>>>>> >   S2': r_1 = r_2 + tem_4;
>>> >>>>>>> >
>>> >>>>>>> > which works for plus at least.  More generally doing
>>> >>>>>>> >
>>> >>>>>>> >   S1': tem_4 = VEC_COND_EXPR<MASK, d_3, neutral operand>
>>> >>>>>>> >   S2': r_1 = r_2 OP tem_4;
>>> >>>>>>> >
>>> >>>>>>> > and leaving optimization to & to later opts (& won't work for
>>> >>>>>>> > AVX512 mask registers I guess).
>>> >>>>>>> >
>>> >>>>>>> > Good enough for later enhacement of course.
>>> >>>>>>> >
>>> >>>>>>> > +static void
>>> >>>>>>> > +vect_gen_ivs_for_masking (loop_vec_info loop_vinfo, vec<tree> *ivs)
>>> >>>>>>> > +{
>>> >>>>>>> > ...
>>> >>>>>>> >
>>> >>>>>>> > isn't it enough to always create a single IV and derive the
>>> >>>>>>> > additional copies by IV + i * { elems, elems, elems ... }?
>>> >>>>>>> > IVs are expensive -- I'm sure we can optimize the rest of the
>>> >>>>>>> > scheme further as well but this one looks obvious to me.
>>> >>>>>>> >
>>> >>>>>>> > @@ -3225,12 +3508,32 @@ vect_estimate_min_profitable_iters (loop_vec_info
>>> >>>>>>> > loop_vinfo,
>>> >>>>>>> >    int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
>>> >>>>>>> >    void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
>>> >>>>>>> >
>>> >>>>>>> > +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
>>> >>>>>>> > +    {
>>> >>>>>>> > +      /* Currently we don't produce scalar epilogue version in case
>>> >>>>>>> > +        its masked version is provided.  It means we don't need to
>>> >>>>>>> > +        compute profitability one more time here.  Just make a
>>> >>>>>>> > +        masked loop version.  */
>>> >>>>>>> > +      if (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
>>> >>>>>>> > +         && PARAM_VALUE (PARAM_VECT_EPILOGUES_MASK))
>>> >>>>>>> > +       {
>>> >>>>>>> > +         dump_printf_loc (MSG_NOTE, vect_location,
>>> >>>>>>> > +                          "cost model: mask loop epilogue.\n");
>>> >>>>>>> > +         LOOP_VINFO_MASK_LOOP (loop_vinfo) = true;
>>> >>>>>>> > +         *ret_min_profitable_niters = 0;
>>> >>>>>>> > +         *ret_min_profitable_estimate = 0;
>>> >>>>>>> > +         return;
>>> >>>>>>> > +       }
>>> >>>>>>> > +    }
>>> >>>>>>> >    /* Cost model disabled.  */
>>> >>>>>>> > -  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
>>> >>>>>>> > +  else if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
>>> >>>>>>> >      {
>>> >>>>>>> >        dump_printf_loc (MSG_NOTE, vect_location, "cost model
>>> >>>>>>> > disabled.\n");
>>> >>>>>>> >        *ret_min_profitable_niters = 0;
>>> >>>>>>> >        *ret_min_profitable_estimate = 0;
>>> >>>>>>> > +      if (PARAM_VALUE (PARAM_VECT_EPILOGUES_MASK)
>>> >>>>>>> > +         && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
>>> >>>>>>> > +       LOOP_VINFO_MASK_LOOP (loop_vinfo) = true;
>>> >>>>>>> >        return;
>>> >>>>>>> >      }
>>> >>>>>>> >
>>> >>>>>>> > the unlimited_cost_model case should come first?  OTOH masking or
>>> >>>>>>> > not is probably not sth covered by 'unlimited' - that is about
>>> >>>>>>> > vectorizing or not.  But the above code means that for
>>> >>>>>>> > epilogue vectorization w/o masking we ignore unlimited_cost_model ()?
>>> >>>>>>> > That doesn't make sense to me.
>>> >>>>>>> >
>>> >>>>>>> > Plus if this is short-trip or epilogue vectorization and the
>>> >>>>>>> > cost model is _not_ unlimited then we dont' want to enable
>>> >>>>>>> > masking always (if it is possible).  It might be we statically
>>> >>>>>>> > know the epilogue executes for at most two iterations for example.
>>> >>>>>>> >
>>> >>>>>>> > I don't see _any_ cost model for vectorizing the epilogue with
>>> >>>>>>> > masking?  Am I missing something?  A "trivial" cost model
>>> >>>>>>> > should at least consider the additional IV(s), the mask
>>> >>>>>>> > compute and the widening and narrowing ops required.
>>> >>>>>>> >
>>> >>>>>>> > diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
>>> >>>>>>> > index e13d6a2..36be342 100644
>>> >>>>>>> > --- a/gcc/tree-vect-loop-manip.c
>>> >>>>>>> > +++ b/gcc/tree-vect-loop-manip.c
>>> >>>>>>> > @@ -1635,6 +1635,13 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree
>>> >>>>>>> > niters, tree nitersm1,
>>> >>>>>>> >    bool epilog_peeling = (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
>>> >>>>>>> >                          || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
>>> >>>>>>> >
>>> >>>>>>> > +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
>>> >>>>>>> > +    {
>>> >>>>>>> > +      prolog_peeling = false;
>>> >>>>>>> > +      if (LOOP_VINFO_MASK_LOOP (loop_vinfo))
>>> >>>>>>> > +       epilog_peeling = false;
>>> >>>>>>> > +    }
>>> >>>>>>> > +
>>> >>>>>>> >    if (!prolog_peeling && !epilog_peeling)
>>> >>>>>>> >      return NULL;
>>> >>>>>>> >
>>> >>>>>>> > I think the prolog_peeling was fixed during the epilogue vectorization
>>> >>>>>>> > review and should no longer be necessary.  Please add
>>> >>>>>>> > a && ! LOOP_VINFO_MASK_LOOP () to the epilog_peeling init instead
>>> >>>>>>> > (it should also work for short-trip loop vectorization).
>>> >>>>>>> >
>>> >>>>>>> > @@ -2022,11 +2291,18 @@ start_over:
>>> >>>>>>> >        || (max_niter != -1
>>> >>>>>>> >           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
>>> >>>>>>> >      {
>>> >>>>>>> > -      if (dump_enabled_p ())
>>> >>>>>>> > -       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>>> >>>>>>> > -                        "not vectorized: iteration count smaller than "
>>> >>>>>>> > -                        "vectorization factor.\n");
>>> >>>>>>> > -      return false;
>>> >>>>>>> > +      /* Allow low trip count for loop epilogue we want to mask.  */
>>> >>>>>>> > +      if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
>>> >>>>>>> > +         && PARAM_VALUE (PARAM_VECT_EPILOGUES_MASK))
>>> >>>>>>> > +       LOOP_VINFO_MASK_LOOP (loop_vinfo) = true;
>>> >>>>>>> > +      else
>>> >>>>>>> > +       {
>>> >>>>>>> > +         if (dump_enabled_p ())
>>> >>>>>>> >
>>> >>>>>>> > so why do we test only LOOP_VINFO_EPILOGUE_P here?  All the code
>>> >>>>>>> > I saw sofar would also work for the main loop (but the cost
>>> >>>>>>> > model is missing).
>>> >>>>>>> >
>>> >>>>>>> > I am missing testcases.  There's only a single one but we should
>>> >>>>>>> > have cases covering all kinds of mask IV widths and widen/shorten
>>> >>>>>>> > masks.
>>> >>>>>>> >
>>> >>>>>>> > Do you have any numbers on SPEC 2k6 with epilogue vect and/or masking
>>> >>>>>>> > enabled for an AVX2 machine?
>>> >>>>>>> >
>>> >>>>>>> > Oh, and I really dislike the --param paywall.
>>> >>>>>>> >
>>> >>>>>>> > Thanks,
>>> >>>>>>> > Richard.
>>> >>>>>>> >
>>> >>>>>>> >> Best regards.
>>> >>>>>>> >> Yuri.
>>> >>>>>>> >>
>>> >>>>>>> >>
>>> >>>>>>> >> 2016-11-28 17:39 GMT+03:00 Richard Biener <rguenther@suse.de>:
>>> >>>>>>> >> > On Thu, 24 Nov 2016, Yuri Rumyantsev wrote:
>>> >>>>>>> >> >
>>> >>>>>>> >> >> Hi All,
>>> >>>>>>> >> >>
>>> >>>>>>> >> >> Here is the second patch which supports epilogue vectorization using
>>> >>>>>>> >> >> masking without cost model. Currently it is possible
>>> >>>>>>> >> >> only with passing parameter "--param vect-epilogues-mask=1".
>>> >>>>>>> >> >>
>>> >>>>>>> >> >> Bootstrapping and regression testing did not show any new regression.
>>> >>>>>>> >> >>
>>> >>>>>>> >> >> Any comments will be appreciated.
>>> >>>>>>> >> >
>>> >>>>>>> >> > Going over the patch the main question is one how it works -- it looks
>>> >>>>>>> >> > like the decision whether to vectorize & mask the epilogue is made
>>> >>>>>>> >> > when vectorizing the loop that generates the epilogue rather than
>>> >>>>>>> >> > in the epilogue vectorization path?
>>> >>>>>>> >> >
>>> >>>>>>> >> > That is, I'd have expected to see this handling low-trip count loops
>>> >>>>>>> >> > by masking?  And thus masking the epilogue simply by it being
>>> >>>>>>> >> > low-trip count?
>>> >>>>>>> >> >
>>> >>>>>>> >> > Richard.
>>> >>>>>>> >> >
>>> >>>>>>> >> >> ChangeLog:
>>> >>>>>>> >> >> 2016-11-24  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>> >>>>>>> >> >>
>>> >>>>>>> >> >> * params.def (PARAM_VECT_EPILOGUES_MASK): New.
>>> >>>>>>> >> >> * tree-vect-data-refs.c (vect_get_new_ssa_name): Support vect_mask_var.
>>> >>>>>>> >> >> * tree-vect-loop.c: Include insn-config.h, recog.h and alias.h.
>>> >>>>>>> >> >> (new_loop_vec_info): Add zeroing can_be_masked, mask_loop and
>>> >>>>>>> >> >> required_mask fields.
>>> >>>>>>> >> >> (vect_check_required_masks_widening): New.
>>> >>>>>>> >> >> (vect_check_required_masks_narrowing): New.
>>> >>>>>>> >> >> (vect_get_masking_iv_elems): New.
>>> >>>>>>> >> >> (vect_get_masking_iv_type): New.
>>> >>>>>>> >> >> (vect_get_extreme_masks): New.
>>> >>>>>>> >> >> (vect_check_required_masks): New.
>>> >>>>>>> >> >> (vect_analyze_loop_operations): Call vect_check_required_masks if all
>>> >>>>>>> >> >> statements can be masked.
>>> >>>>>>> >> >> (vect_analyze_loop_2): Inititalize to zero min_scalar_loop_bound.
>>> >>>>>>> >> >> Add check that epilogue can be masked with the same vf with issue
>>> >>>>>>> >> >> fail notes.  Allow epilogue vectorization through masking of low trip
>>> >>>>>>> >> >> loops. Set to true can_be_masked field before loop operation analysis.
>>> >>>>>>> >> >> Do not set-up min_scalar_loop_bound for epilogue vectorization through
>>> >>>>>>> >> >> masking.  Do not peeling for epilogue masking.  Reset can_be_masked
>>> >>>>>>> >> >> field before repeat analysis.
>>> >>>>>>> >> >> (vect_estimate_min_profitable_iters): Do not compute profitability
>>> >>>>>>> >> >> for epilogue masking.  Set up mask_loop filed to true if parameter
>>> >>>>>>> >> >> PARAM_VECT_EPILOGUES_MASK is non-zero.
>>> >>>>>>> >> >> (vectorizable_reduction): Add check that statement can be masked.
>>> >>>>>>> >> >> (vectorizable_induction): Do not support masking for induction.
>>> >>>>>>> >> >> (vect_gen_ivs_for_masking): New.
>>> >>>>>>> >> >> (vect_get_mask_index_for_elems): New.
>>> >>>>>>> >> >> (vect_get_mask_index_for_type): New.
>>> >>>>>>> >> >> (vect_create_narrowed_masks): New.
>>> >>>>>>> >> >> (vect_create_widened_masks): New.
>>> >>>>>>> >> >> (vect_gen_loop_masks): New.
>>> >>>>>>> >> >> (vect_mask_reduction_stmt): New.
>>> >>>>>>> >> >> (vect_mask_mask_load_store_stmt): New.
>>> >>>>>>> >> >> (vect_mask_load_store_stmt): New.
>>> >>>>>>> >> >> (vect_mask_loop): New.
>>> >>>>>>> >> >> (vect_transform_loop): Invoke vect_mask_loop if required.
>>> >>>>>>> >> >> Use div_ceil to recompute upper bounds for masked loops.  Issue
>>> >>>>>>> >> >> statistics for epilogue vectorization through masking. Do not reduce
>>> >>>>>>> >> >> vf for masking epilogue.
>>> >>>>>>> >> >> * tree-vect-stmts.c: Include tree-ssa-loop-ivopts.h.
>>> >>>>>>> >> >> (can_mask_load_store): New.
>>> >>>>>>> >> >> (vectorizable_mask_load_store): Check that mask conjuction is
>>> >>>>>>> >> >> supported.  Set-up first_copy_p field of stmt_vinfo.
>>> >>>>>>> >> >> (vectorizable_simd_clone_call): Check that simd clone can not be
>>> >>>>>>> >> >> masked.
>>> >>>>>>> >> >> (vectorizable_store): Check that store can be masked. Mark the first
>>> >>>>>>> >> >> copy of generated vector stores and provide it with vectype and the
>>> >>>>>>> >> >> original data reference.
>>> >>>>>>> >> >> (vectorizable_load): Check that load can be masked.
>>> >>>>>>> >> >> (vect_stmt_should_be_masked_for_epilogue): New.
>>> >>>>>>> >> >> (vect_add_required_mask_for_stmt): New.
>>> >>>>>>> >> >> (vect_analyze_stmt): Add check on unsupported statements for masking
>>> >>>>>>> >> >> with printing message.
>>> >>>>>>> >> >> * tree-vectorizer.h (struct _loop_vec_info): Add new fields
>>> >>>>>>> >> >> can_be_maske, required_masks, masl_loop.
>>> >>>>>>> >> >> (LOOP_VINFO_CAN_BE_MASKED): New.
>>> >>>>>>> >> >> (LOOP_VINFO_REQUIRED_MASKS): New.
>>> >>>>>>> >> >> (LOOP_VINFO_MASK_LOOP): New.
>>> >>>>>>> >> >> (struct _stmt_vec_info): Add first_copy_p field.
>>> >>>>>>> >> >> (STMT_VINFO_FIRST_COPY_P): New.
>>> >>>>>>> >> >>
>>> >>>>>>> >> >> gcc/testsuite/
>>> >>>>>>> >> >>
>>> >>>>>>> >> >> * gcc.dg/vect/vect-tail-mask-1.c: New test.
>>> >>>>>>> >> >>
>>> >>>>>>> >> >> 2016-11-18 18:54 GMT+03:00 Christophe Lyon <christophe.lyon@linaro.org>:
>>> >>>>>>> >> >> > On 18 November 2016 at 16:46, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>> >>>>>>> >> >> >> It is very strange that this test failed on arm, since it requires
>>> >>>>>>> >> >> >> target avx2 to check vectorizer dumps:
>>> >>>>>>> >> >> >>
>>> >>>>>>> >> >> >> /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" {
>>> >>>>>>> >> >> >> target avx2_runtime } } } */
>>> >>>>>>> >> >> >> /* { dg-final { scan-tree-dump-times "LOOP EPILOGUE VECTORIZED
>>> >>>>>>> >> >> >> \\(VS=16\\)" 2 "vect" { target avx2_runtime } } } */
>>> >>>>>>> >> >> >>
>>> >>>>>>> >> >> >> Could you please clarify what is the reason of the failure?
>>> >>>>>>> >> >> >
>>> >>>>>>> >> >> > It's not the scan-dumps that fail, but the execution.
>>> >>>>>>> >> >> > The test calls abort() for some reason.
>>> >>>>>>> >> >> >
>>> >>>>>>> >> >> > It will take me a while to rebuild the test manually in the right
>>> >>>>>>> >> >> > debug environment to provide you with more traces.
>>> >>>>>>> >> >> >
>>> >>>>>>> >> >> >
>>> >>>>>>> >> >> >
>>> >>>>>>> >> >> >>
>>> >>>>>>> >> >> >> Thanks.
>>> >>>>>>> >> >> >>
>>> >>>>>>> >> >> >> 2016-11-18 16:20 GMT+03:00 Christophe Lyon <christophe.lyon@linaro.org>:
>>> >>>>>>> >> >> >>> On 15 November 2016 at 15:41, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>> >>>>>>> >> >> >>>> Hi All,
>>> >>>>>>> >> >> >>>>
>>> >>>>>>> >> >> >>>> Here is patch for non-masked epilogue vectoriziation.
>>> >>>>>>> >> >> >>>>
>>> >>>>>>> >> >> >>>> Bootstrap and regression testing did not show any new failures.
>>> >>>>>>> >> >> >>>>
>>> >>>>>>> >> >> >>>> Is it OK for trunk?
>>> >>>>>>> >> >> >>>>
>>> >>>>>>> >> >> >>>> Thanks.
>>> >>>>>>> >> >> >>>> Changelog:
>>> >>>>>>> >> >> >>>>
>>> >>>>>>> >> >> >>>> 2016-11-15  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>> >>>>>>> >> >> >>>>
>>> >>>>>>> >> >> >>>> * params.def (PARAM_VECT_EPILOGUES_NOMASK): New.
>>> >>>>>>> >> >> >>>> * tree-if-conv.c (tree_if_conversion): Make public.
>>> >>>>>>> >> >> >>>> * * tree-if-conv.h: New file.
>>> >>>>>>> >> >> >>>> * tree-vect-data-refs.c (vect_analyze_data_ref_dependences) Avoid
>>> >>>>>>> >> >> >>>> dynamic alias checks for epilogues.
>>> >>>>>>> >> >> >>>> * tree-vect-loop-manip.c (vect_do_peeling): Return created epilog.
>>> >>>>>>> >> >> >>>> * tree-vect-loop.c: include tree-if-conv.h.
>>> >>>>>>> >> >> >>>> (new_loop_vec_info): Add zeroing orig_loop_info field.
>>> >>>>>>> >> >> >>>> (vect_analyze_loop_2): Don't try to enhance alignment for epilogues.
>>> >>>>>>> >> >> >>>> (vect_analyze_loop): Add argument ORIG_LOOP_INFO which is not NULL
>>> >>>>>>> >> >> >>>> if epilogue is vectorized, set up orig_loop_info field of loop_vinfo
>>> >>>>>>> >> >> >>>> using passed argument.
>>> >>>>>>> >> >> >>>> (vect_transform_loop): Check if created epilogue should be returned
>>> >>>>>>> >> >> >>>> for further vectorization with less vf.  If-convert epilogue if
>>> >>>>>>> >> >> >>>> required. Print vectorization success for epilogue.
>>> >>>>>>> >> >> >>>> * tree-vectorizer.c (vectorize_loops): Add epilogue vectorization
>>> >>>>>>> >> >> >>>> if it is required, pass loop_vinfo produced during vectorization of
>>> >>>>>>> >> >> >>>> loop body to vect_analyze_loop.
>>> >>>>>>> >> >> >>>> * tree-vectorizer.h (struct _loop_vec_info): Add new field
>>> >>>>>>> >> >> >>>> orig_loop_info.
>>> >>>>>>> >> >> >>>> (LOOP_VINFO_ORIG_LOOP_INFO): New.
>>> >>>>>>> >> >> >>>> (LOOP_VINFO_EPILOGUE_P): New.
>>> >>>>>>> >> >> >>>> (LOOP_VINFO_ORIG_VECT_FACTOR): New.
>>> >>>>>>> >> >> >>>> (vect_do_peeling): Change prototype to return epilogue.
>>> >>>>>>> >> >> >>>> (vect_analyze_loop): Add argument of loop_vec_info type.
>>> >>>>>>> >> >> >>>> (vect_transform_loop): Return created loop.
>>> >>>>>>> >> >> >>>>
>>> >>>>>>> >> >> >>>> gcc/testsuite/
>>> >>>>>>> >> >> >>>>
>>> >>>>>>> >> >> >>>> * lib/target-supports.exp (check_avx2_hw_available): New.
>>> >>>>>>> >> >> >>>> (check_effective_target_avx2_runtime): New.
>>> >>>>>>> >> >> >>>> * gcc.dg/vect/vect-tail-nomask-1.c: New test.
>>> >>>>>>> >> >> >>>>
>>> >>>>>>> >> >> >>>
>>> >>>>>>> >> >> >>> Hi,
>>> >>>>>>> >> >> >>>
>>> >>>>>>> >> >> >>> This new test fails on arm-none-eabi (using default cpu/fpu/mode):
>>> >>>>>>> >> >> >>>   gcc.dg/vect/vect-tail-nomask-1.c -flto -ffat-lto-objects execution test
>>> >>>>>>> >> >> >>>   gcc.dg/vect/vect-tail-nomask-1.c execution test
>>> >>>>>>> >> >> >>>
>>> >>>>>>> >> >> >>> It does pass on the same target if configured --with-cpu=cortex-a9.
>>> >>>>>>> >> >> >>>
>>> >>>>>>> >> >> >>> Christophe
>>> >>>>>>> >> >> >>>
>>> >>>>>>> >> >> >>>
>>> >>>>>>> >> >> >>>
>>> >>>>>>> >> >> >>>>
>>> >>>>>>> >> >> >>>> 2016-11-14 20:04 GMT+03:00 Richard Biener <rguenther@suse.de>:
>>> >>>>>>> >> >> >>>>> On November 14, 2016 4:39:40 PM GMT+01:00, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>>> >>>>>>> >> >> >>>>>>Richard,
>>> >>>>>>> >> >> >>>>>>
>>> >>>>>>> >> >> >>>>>>I checked one of the tests designed for epilogue vectorization using
>>> >>>>>>> >> >> >>>>>>patches 1 - 3 and found out that build compiler performs vectorization
>>> >>>>>>> >> >> >>>>>>of epilogues with --param vect-epilogues-nomask=1 passed:
>>> >>>>>>> >> >> >>>>>>
>>> >>>>>>> >> >> >>>>>>$ gcc -Ofast -mavx2 t1.c -S --param vect-epilogues-nomask=1 -o
>>> >>>>>>> >> >> >>>>>>t1.new-nomask.s -fdump-tree-vect-details
>>> >>>>>>> >> >> >>>>>>$ grep VECTORIZED -c t1.c.156t.vect
>>> >>>>>>> >> >> >>>>>>4
>>> >>>>>>> >> >> >>>>>> Without param only 2 loops are vectorized.
>>> >>>>>>> >> >> >>>>>>
>>> >>>>>>> >> >> >>>>>>Should I simply add a part of tests related to this feature or I must
>>> >>>>>>> >> >> >>>>>>delete all not necessary changes also?
>>> >>>>>>> >> >> >>>>>
>>> >>>>>>> >> >> >>>>> Please remove all not necessary changes.
>>> >>>>>>> >> >> >>>>>
>>> >>>>>>> >> >> >>>>> Richard.
>>> >>>>>>> >> >> >>>>>
>>> >>>>>>> >> >> >>>>>>Thanks.
>>> >>>>>>> >> >> >>>>>>Yuri.
>>> >>>>>>> >> >> >>>>>>
>>> >>>>>>> >> >> >>>>>>2016-11-14 16:40 GMT+03:00 Richard Biener <rguenther@suse.de>:
>>> >>>>>>> >> >> >>>>>>> On Mon, 14 Nov 2016, Yuri Rumyantsev wrote:
>>> >>>>>>> >> >> >>>>>>>
>>> >>>>>>> >> >> >>>>>>>> Richard,
>>> >>>>>>> >> >> >>>>>>>>
>>> >>>>>>> >> >> >>>>>>>> In my previous patch I forgot to remove couple lines related to aux
>>> >>>>>>> >> >> >>>>>>field.
>>> >>>>>>> >> >> >>>>>>>> Here is the correct updated patch.
>>> >>>>>>> >> >> >>>>>>>
>>> >>>>>>> >> >> >>>>>>> Yeah, I noticed.  This patch would be ok for trunk (together with
>>> >>>>>>> >> >> >>>>>>> necessary parts from 1 and 2) if all not required parts are removed
>>> >>>>>>> >> >> >>>>>>> (and you'd add the testcases covering non-masked tail vect).
>>> >>>>>>> >> >> >>>>>>>
>>> >>>>>>> >> >> >>>>>>> Thus, can you please produce a single complete patch containing only
>>> >>>>>>> >> >> >>>>>>> non-masked epilogue vectoriziation?
>>> >>>>>>> >> >> >>>>>>>
>>> >>>>>>> >> >> >>>>>>> Thanks,
>>> >>>>>>> >> >> >>>>>>> Richard.
>>> >>>>>>> >> >> >>>>>>>
>>> >>>>>>> >> >> >>>>>>>> Thanks.
>>> >>>>>>> >> >> >>>>>>>> Yuri.
>>> >>>>>>> >> >> >>>>>>>>
>>> >>>>>>> >> >> >>>>>>>> 2016-11-14 15:51 GMT+03:00 Richard Biener <rguenther@suse.de>:
>>> >>>>>>> >> >> >>>>>>>> > On Fri, 11 Nov 2016, Yuri Rumyantsev wrote:
>>> >>>>>>> >> >> >>>>>>>> >
>>> >>>>>>> >> >> >>>>>>>> >> Richard,
>>> >>>>>>> >> >> >>>>>>>> >>
>>> >>>>>>> >> >> >>>>>>>> >> I prepare updated 3 patch with passing additional argument to
>>> >>>>>>> >> >> >>>>>>>> >> vect_analyze_loop as you proposed (untested).
>>> >>>>>>> >> >> >>>>>>>> >>
>>> >>>>>>> >> >> >>>>>>>> >> You wrote:
>>> >>>>>>> >> >> >>>>>>>> >> tw, I wonder if you can produce a single patch containing just
>>> >>>>>>> >> >> >>>>>>>> >> epilogue vectorization, that is combine patches 1-3 but rip out
>>> >>>>>>> >> >> >>>>>>>> >> changes only needed by later patches?
>>> >>>>>>> >> >> >>>>>>>> >>
>>> >>>>>>> >> >> >>>>>>>> >> Did you mean that I exclude all support for vectorization
>>> >>>>>>> >> >> >>>>>>epilogues,
>>> >>>>>>> >> >> >>>>>>>> >> i.e. exclude from 2-nd patch all non-related changes
>>> >>>>>>> >> >> >>>>>>>> >> like
>>> >>>>>>> >> >> >>>>>>>> >>
>>> >>>>>>> >> >> >>>>>>>> >> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
>>> >>>>>>> >> >> >>>>>>>> >> index 11863af..32011c1 100644
>>> >>>>>>> >> >> >>>>>>>> >> --- a/gcc/tree-vect-loop.c
>>> >>>>>>> >> >> >>>>>>>> >> +++ b/gcc/tree-vect-loop.c
>>> >>>>>>> >> >> >>>>>>>> >> @@ -1120,6 +1120,12 @@ new_loop_vec_info (struct loop *loop)
>>> >>>>>>> >> >> >>>>>>>> >>    LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
>>> >>>>>>> >> >> >>>>>>>> >>    LOOP_VINFO_PEELING_FOR_NITER (res) = false;
>>> >>>>>>> >> >> >>>>>>>> >>    LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
>>> >>>>>>> >> >> >>>>>>>> >> +  LOOP_VINFO_CAN_BE_MASKED (res) = false;
>>> >>>>>>> >> >> >>>>>>>> >> +  LOOP_VINFO_REQUIRED_MASKS (res) = 0;
>>> >>>>>>> >> >> >>>>>>>> >> +  LOOP_VINFO_COMBINE_EPILOGUE (res) = false;
>>> >>>>>>> >> >> >>>>>>>> >> +  LOOP_VINFO_MASK_EPILOGUE (res) = false;
>>> >>>>>>> >> >> >>>>>>>> >> +  LOOP_VINFO_NEED_MASKING (res) = false;
>>> >>>>>>> >> >> >>>>>>>> >> +  LOOP_VINFO_ORIG_LOOP_INFO (res) = NULL;
>>> >>>>>>> >> >> >>>>>>>> >
>>> >>>>>>> >> >> >>>>>>>> > Yes.
>>> >>>>>>> >> >> >>>>>>>> >
>>> >>>>>>> >> >> >>>>>>>> >> Did you mean also that new combined patch must be working patch,
>>> >>>>>>> >> >> >>>>>>i.e.
>>> >>>>>>> >> >> >>>>>>>> >> can be integrated without other patches?
>>> >>>>>>> >> >> >>>>>>>> >
>>> >>>>>>> >> >> >>>>>>>> > Yes.
>>> >>>>>>> >> >> >>>>>>>> >
>>> >>>>>>> >> >> >>>>>>>> >> Could you please look at updated patch?
>>> >>>>>>> >> >> >>>>>>>> >
>>> >>>>>>> >> >> >>>>>>>> > Will do.
>>> >>>>>>> >> >> >>>>>>>> >
>>> >>>>>>> >> >> >>>>>>>> > Thanks,
>>> >>>>>>> >> >> >>>>>>>> > Richard.
>>> >>>>>>> >> >> >>>>>>>> >
>>> >>>>>>> >> >> >>>>>>>> >> Thanks.
>>> >>>>>>> >> >> >>>>>>>> >> Yuri.
>>> >>>>>>> >> >> >>>>>>>> >>
>>> >>>>>>> >> >> >>>>>>>> >> 2016-11-10 15:36 GMT+03:00 Richard Biener <rguenther@suse.de>:
>>> >>>>>>> >> >> >>>>>>>> >> > On Thu, 10 Nov 2016, Richard Biener wrote:
>>> >>>>>>> >> >> >>>>>>>> >> >
>>> >>>>>>> >> >> >>>>>>>> >> >> On Tue, 8 Nov 2016, Yuri Rumyantsev wrote:
>>> >>>>>>> >> >> >>>>>>>> >> >>
>>> >>>>>>> >> >> >>>>>>>> >> >> > Richard,
>>> >>>>>>> >> >> >>>>>>>> >> >> >
>>> >>>>>>> >> >> >>>>>>>> >> >> > Here is updated 3 patch.
>>> >>>>>>> >> >> >>>>>>>> >> >> >
>>> >>>>>>> >> >> >>>>>>>> >> >> > I checked that all new tests related to epilogue
>>> >>>>>>> >> >> >>>>>>vectorization passed with it.
>>> >>>>>>> >> >> >>>>>>>> >> >> >
>>> >>>>>>> >> >> >>>>>>>> >> >> > Your comments will be appreciated.
>>> >>>>>>> >> >> >>>>>>>> >> >>
>>> >>>>>>> >> >> >>>>>>>> >> >> A lot better now.  Instead of the ->aux dance I now prefer to
>>> >>>>>>> >> >> >>>>>>>> >> >> pass the original loops loop_vinfo to vect_analyze_loop as
>>> >>>>>>> >> >> >>>>>>>> >> >> optional argument (if non-NULL we analyze the epilogue of that
>>> >>>>>>> >> >> >>>>>>>> >> >> loop_vinfo).  OTOH I remember we mainly use it to get at the
>>> >>>>>>> >> >> >>>>>>>> >> >> original vectorization factor?  So we can pass down an
>>> >>>>>>> >> >> >>>>>>(optional)
>>> >>>>>>> >> >> >>>>>>>> >> >> forced vectorization factor as well?
>>> >>>>>>> >> >> >>>>>>>> >> >
>>> >>>>>>> >> >> >>>>>>>> >> > Btw, I wonder if you can produce a single patch containing just
>>> >>>>>>> >> >> >>>>>>>> >> > epilogue vectorization, that is combine patches 1-3 but rip out
>>> >>>>>>> >> >> >>>>>>>> >> > changes only needed by later patches?
>>> >>>>>>> >> >> >>>>>>>> >> >
>>> >>>>>>> >> >> >>>>>>>> >> > Thanks,
>>> >>>>>>> >> >> >>>>>>>> >> > Richard.
>>> >>>>>>> >> >> >>>>>>>> >> >
>>> >>>>>>> >> >> >>>>>>>> >> >> Richard.
>>> >>>>>>> >> >> >>>>>>>> >> >>
>>> >>>>>>> >> >> >>>>>>>> >> >> > 2016-11-08 15:38 GMT+03:00 Richard Biener
>>> >>>>>>> >> >> >>>>>><rguenther@suse.de>:
>>> >>>>>>> >> >> >>>>>>>> >> >> > > On Thu, 3 Nov 2016, Yuri Rumyantsev wrote:
>>> >>>>>>> >> >> >>>>>>>> >> >> > >
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> Hi Richard,
>>> >>>>>>> >> >> >>>>>>>> >> >> > >>
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> I did not understand your last remark:
>>> >>>>>>> >> >> >>>>>>>> >> >> > >>
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > That is, here (and avoid the FOR_EACH_LOOP change):
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > @@ -580,12 +586,21 @@ vectorize_loops (void)
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >           && dump_enabled_p ())
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >           dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
>>> >>>>>>> >> >> >>>>>>vect_location,
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >                            "loop vectorized\n");
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > -       vect_transform_loop (loop_vinfo);
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +       new_loop = vect_transform_loop (loop_vinfo);
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >         num_vectorized_loops++;
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >        /* Now that the loop has been vectorized, allow
>>> >>>>>>> >> >> >>>>>>it to be unrolled
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >           etc.  */
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >      loop->force_vectorize = false;
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +       /* Add new loop to a processing queue.  To make
>>> >>>>>>> >> >> >>>>>>it easier
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +          to match loop and its epilogue vectorization
>>> >>>>>>> >> >> >>>>>>in dumps
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +          put new loop as the next loop to process.
>>> >>>>>>> >> >> >>>>>>*/
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +       if (new_loop)
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +         {
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +           loops.safe_insert (i + 1, new_loop->num);
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +           vect_loops_num = number_of_loops (cfun);
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +         }
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > simply dispatch to a vectorize_epilogue (loop_vinfo,
>>> >>>>>>> >> >> >>>>>>new_loop)
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> f> unction which will set up stuff properly (and also
>>> >>>>>>> >> >> >>>>>>perform
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > the if-conversion of the epilogue there).
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > That said, if we can get in non-masked epilogue
>>> >>>>>>> >> >> >>>>>>vectorization
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > separately that would be great.
>>> >>>>>>> >> >> >>>>>>>> >> >> > >>
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> Could you please clarify your proposal.
>>> >>>>>>> >> >> >>>>>>>> >> >> > >
>>> >>>>>>> >> >> >>>>>>>> >> >> > > When a loop was vectorized set things up to immediately
>>> >>>>>>> >> >> >>>>>>vectorize
>>> >>>>>>> >> >> >>>>>>>> >> >> > > its epilogue, avoiding changing the loop iteration and
>>> >>>>>>> >> >> >>>>>>avoiding
>>> >>>>>>> >> >> >>>>>>>> >> >> > > the re-use of ->aux.
>>> >>>>>>> >> >> >>>>>>>> >> >> > >
>>> >>>>>>> >> >> >>>>>>>> >> >> > > Richard.
>>> >>>>>>> >> >> >>>>>>>> >> >> > >
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> Thanks.
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> Yuri.
>>> >>>>>>> >> >> >>>>>>>> >> >> > >>
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> 2016-11-02 15:27 GMT+03:00 Richard Biener
>>> >>>>>>> >> >> >>>>>><rguenther@suse.de>:
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > On Tue, 1 Nov 2016, Yuri Rumyantsev wrote:
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >> Hi All,
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >>
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >> I re-send all patches sent by Ilya earlier for review
>>> >>>>>>> >> >> >>>>>>which support
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >> vectorization of loop epilogues and loops with low
>>> >>>>>>> >> >> >>>>>>trip count. We
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >> assume that the only patch -
>>> >>>>>>> >> >> >>>>>>vec-tails-07-combine-tail.patch - was not
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >> approved by Jeff.
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >>
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >> I did re-base of all patches and performed
>>> >>>>>>> >> >> >>>>>>bootstrapping and
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >> regression testing that did not show any new failures.
>>> >>>>>>> >> >> >>>>>>Also all
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >> changes related to new vect_do_peeling algorithm have
>>> >>>>>>> >> >> >>>>>>been changed
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >> accordingly.
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >>
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >> Is it OK for trunk?
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > I would have prefered that the series up to
>>> >>>>>>> >> >> >>>>>>-03-nomask-tails would
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > _only_ contain epilogue loop vectorization changes but
>>> >>>>>>> >> >> >>>>>>unfortunately
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > the patchset is oddly separated.
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > I have a comment on that part nevertheless:
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > @@ -1608,7 +1614,10 @@ vect_enhance_data_refs_alignment
>>> >>>>>>> >> >> >>>>>>(loop_vec_info
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > loop_vinfo)
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >    /* Check if we can possibly peel the loop.  */
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >    if (!vect_can_advance_ivs_p (loop_vinfo)
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >        || !slpeel_can_duplicate_loop_p (loop,
>>> >>>>>>> >> >> >>>>>>single_exit (loop))
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > -      || loop->inner)
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +      || loop->inner
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +      /* Required peeling was performed in prologue
>>> >>>>>>> >> >> >>>>>>and
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +        is not required for epilogue.  */
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +      || LOOP_VINFO_EPILOGUE_P (loop_vinfo))
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >      do_peeling = false;
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >    if (do_peeling
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > @@ -1888,7 +1897,10 @@ vect_enhance_data_refs_alignment
>>> >>>>>>> >> >> >>>>>>(loop_vec_info
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > loop_vinfo)
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >    do_versioning =
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >         optimize_loop_nest_for_speed_p (loop)
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > -       && (!loop->inner); /* FORNOW */
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +       && (!loop->inner) /* FORNOW */
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +        /* Required versioning was performed for the
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +          original loop and is not required for
>>> >>>>>>> >> >> >>>>>>epilogue.  */
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +       && !LOOP_VINFO_EPILOGUE_P (loop_vinfo);
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >    if (do_versioning)
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >      {
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > please do that check in the single caller of this
>>> >>>>>>> >> >> >>>>>>function.
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > Otherwise I still dislike the new ->aux use and I
>>> >>>>>>> >> >> >>>>>>believe that simply
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > passing down info from the processed parent would be
>>> >>>>>>> >> >> >>>>>>_much_ cleaner.
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > That is, here (and avoid the FOR_EACH_LOOP change):
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > @@ -580,12 +586,21 @@ vectorize_loops (void)
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >             && dump_enabled_p ())
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >            dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
>>> >>>>>>> >> >> >>>>>>vect_location,
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >                             "loop vectorized\n");
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > -       vect_transform_loop (loop_vinfo);
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +       new_loop = vect_transform_loop (loop_vinfo);
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >         num_vectorized_loops++;
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >         /* Now that the loop has been vectorized, allow
>>> >>>>>>> >> >> >>>>>>it to be unrolled
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >            etc.  */
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >         loop->force_vectorize = false;
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +       /* Add new loop to a processing queue.  To make
>>> >>>>>>> >> >> >>>>>>it easier
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +          to match loop and its epilogue vectorization
>>> >>>>>>> >> >> >>>>>>in dumps
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +          put new loop as the next loop to process.
>>> >>>>>>> >> >> >>>>>>*/
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +       if (new_loop)
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +         {
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +           loops.safe_insert (i + 1, new_loop->num);
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +           vect_loops_num = number_of_loops (cfun);
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > +         }
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > simply dispatch to a vectorize_epilogue (loop_vinfo,
>>> >>>>>>> >> >> >>>>>>new_loop)
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > function which will set up stuff properly (and also
>>> >>>>>>> >> >> >>>>>>perform
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > the if-conversion of the epilogue there).
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > That said, if we can get in non-masked epilogue
>>> >>>>>>> >> >> >>>>>>vectorization
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > separately that would be great.
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > I'm still torn about all the rest of the stuff and
>>> >>>>>>> >> >> >>>>>>question its
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > usability (esp. merging the epilogue with the main
>>> >>>>>>> >> >> >>>>>>vector loop).
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > But it has already been approved ... oh well.
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> >
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > Thanks,
>>> >>>>>>> >> >> >>>>>>>> >> >> > >> > Richard.
>>> >>>>>>> >> >> >>>>>>>> >> >> > >>
>>> >>>>>>> >> >> >>>>>>>> >> >> > >>
>>> >>>>>>> >> >> >>>>>>>> >> >> > >
>>> >>>>>>> >> >> >>>>>>>> >> >> > > --
>>> >>>>>>> >> >> >>>>>>>> >> >> > > Richard Biener <rguenther@suse.de>
>>> >>>>>>> >> >> >>>>>>>> >> >> > > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard,
>>> >>>>>>> >> >> >>>>>>Graham Norton, HRB 21284 (AG Nuernberg)
>>> >>>>>>> >> >> >>>>>>>> >> >> >
>>> >>>>>>> >> >> >>>>>>>> >> >>
>>> >>>>>>> >> >> >>>>>>>> >> >>
>>> >>>>>>> >> >> >>>>>>>> >> >
>>> >>>>>>> >> >> >>>>>>>> >> > --
>>> >>>>>>> >> >> >>>>>>>> >> > Richard Biener <rguenther@suse.de>
>>> >>>>>>> >> >> >>>>>>>> >> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham
>>> >>>>>>> >> >> >>>>>>Norton, HRB 21284 (AG Nuernberg)
>>> >>>>>>> >> >> >>>>>>>> >>
>>> >>>>>>> >> >> >>>>>>>> >
>>> >>>>>>> >> >> >>>>>>>> > --
>>> >>>>>>> >> >> >>>>>>>> > Richard Biener <rguenther@suse.de>
>>> >>>>>>> >> >> >>>>>>>> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham
>>> >>>>>>> >> >> >>>>>>Norton, HRB 21284 (AG Nuernberg)
>>> >>>>>>> >> >> >>>>>>>>
>>> >>>>>>> >> >> >>>>>>>
>>> >>>>>>> >> >> >>>>>>> --
>>> >>>>>>> >> >> >>>>>>> Richard Biener <rguenther@suse.de>
>>> >>>>>>> >> >> >>>>>>> SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham
>>> >>>>>>> >> >> >>>>>>Norton, HRB 21284 (AG Nuernberg)
>>> >>>>>>> >> >> >>>>>
>>> >>>>>>> >> >> >>>>>
>>> >>>>>>> >> >>
>>> >>>>>>> >> >
>>> >>>>>>> >> > --
>>> >>>>>>> >> > Richard Biener <rguenther@suse.de>
>>> >>>>>>> >> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)
>>> >>>>>>> >>
>>> >>>>>>> >
>>> >>>>>>> > --
>>> >>>>>>> > Richard Biener <rguenther@suse.de>
>>> >>>>>>> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)
>>> >>>>>>>
>>> >>>>>>>
>>> >>>>>>
>>> >>>>>> --
>>> >>>>>> Richard Biener <rguenther@suse.de>
>>> >>>>>> SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)
>>>
>>
>> --
>> Richard Biener <rguenther@suse.de>
>> SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)
diff mbox

Patch

diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index e13d6a2..36be342 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -1635,6 +1635,13 @@  vect_do_peeling (loop_vec_info loop_vinfo, tree 
niters, tree nitersm1,
   bool epilog_peeling = (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
                         || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));

+  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
+    {
+      prolog_peeling = false;
+      if (LOOP_VINFO_MASK_LOOP (loop_vinfo))
+       epilog_peeling = false;
+    }
+
   if (!prolog_peeling && !epilog_peeling)
     return NULL;