diff mbox series

[5/5] vect: Support masked gather loads with SLP

Message ID mptfss1mequ.fsf@arm.com
State New
Headers show
Series [1/5] vect: Use code_helper when building SLP nodes | expand

Commit Message

Richard Sandiford Nov. 12, 2021, 6:05 p.m. UTC
This patch extends the previous SLP gather load support so
that it can handle masked gather loads (IFN_MASK_GATHER_LOAD) too.

Bootstrapped and regression-tested on aarch64-linux-gnu and
x86_64-linux-gnu.  OK to install?

Richard


gcc/
	* tree-vect-slp.c (arg1_arg4_map): New variable.
	(vect_get_operand_map): Handle IFN_MASK_GATHER_LOAD.
	(vect_build_slp_tree_1): Likewise.
	(vect_build_slp_tree_2): Likewise.
	* tree-vect-stmts.c (vectorizable_load): Expect the mask to be
	the last SLP child node rather than the first.

gcc/testsuite/
	* gcc.dg/vect/vect-gather-3.c: New test.
	* gcc.dg/vect/vect-gather-4.c: Likewise.
	* gcc.target/aarch64/sve/mask_gather_load_8.c: Likewise.
---
 gcc/testsuite/gcc.dg/vect/vect-gather-3.c     | 64 ++++++++++++++++++
 gcc/testsuite/gcc.dg/vect/vect-gather-4.c     | 48 ++++++++++++++
 .../aarch64/sve/mask_gather_load_8.c          | 65 +++++++++++++++++++
 gcc/tree-vect-slp.c                           | 15 ++++-
 gcc/tree-vect-stmts.c                         | 21 ++++--
 5 files changed, 203 insertions(+), 10 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-gather-3.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-gather-4.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/mask_gather_load_8.c

Comments

Richard Biener Nov. 16, 2021, 12:14 p.m. UTC | #1
On Fri, Nov 12, 2021 at 7:06 PM Richard Sandiford via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> This patch extends the previous SLP gather load support so
> that it can handle masked loads too.
>
> Regstrapped on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?

OK.

Thanks,
Richard.

> Richard
>
>
> gcc/
>         * tree-vect-slp.c (arg1_arg4_map): New variable.
>         (vect_get_operand_map): Handle IFN_MASK_GATHER_LOAD.
>         (vect_build_slp_tree_1): Likewise.
>         (vect_build_slp_tree_2): Likewise.
>         * tree-vect-stmts.c (vectorizable_load): Expect the mask to be
>         the last SLP child node rather than the first.
>
> gcc/testsuite/
>         * gcc.dg/vect/vect-gather-3.c: New test.
>         * gcc.dg/vect/vect-gather-4.c: Likewise.
>         * gcc.target/aarch64/sve/mask_gather_load_8.c: Likewise.
> ---
>  gcc/testsuite/gcc.dg/vect/vect-gather-3.c     | 64 ++++++++++++++++++
>  gcc/testsuite/gcc.dg/vect/vect-gather-4.c     | 48 ++++++++++++++
>  .../aarch64/sve/mask_gather_load_8.c          | 65 +++++++++++++++++++
>  gcc/tree-vect-slp.c                           | 15 ++++-
>  gcc/tree-vect-stmts.c                         | 21 ++++--
>  5 files changed, 203 insertions(+), 10 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-gather-3.c
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-gather-4.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/mask_gather_load_8.c
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-gather-3.c b/gcc/testsuite/gcc.dg/vect/vect-gather-3.c
> new file mode 100644
> index 00000000000..738bd3f3106
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-gather-3.c
> @@ -0,0 +1,64 @@
> +#include "tree-vect.h"
> +
> +#define N 16
> +
> +void __attribute__((noipa))
> +f (int *restrict y, int *restrict x, int *restrict indices)
> +{
> +  for (int i = 0; i < N; ++i)
> +    {
> +      y[i * 2] = (indices[i * 2] < N * 2
> +                 ? x[indices[i * 2]] + 1
> +                 : 1);
> +      y[i * 2 + 1] = (indices[i * 2 + 1] < N * 2
> +                     ? x[indices[i * 2 + 1]] + 2
> +                     : 2);
> +    }
> +}
> +
> +int y[N * 2];
> +int x[N * 2] = {
> +  72704, 52152, 51301, 96681,
> +  57937, 60490, 34504, 60944,
> +  42225, 28333, 88336, 74300,
> +  29250, 20484, 38852, 91536,
> +  86917, 63941, 31590, 21998,
> +  22419, 26974, 28668, 13968,
> +  3451, 20247, 44089, 85521,
> +  22871, 87362, 50555, 85939
> +};
> +int indices[N * 2] = {
> +  15, 0x10000, 0xcafe0, 19,
> +  7, 22, 19, 1,
> +  0x20000, 0x70000, 15, 30,
> +  5, 12, 11, 11,
> +  10, 25, 5, 20,
> +  22, 24, 32, 28,
> +  30, 19, 6, 0xabcdef,
> +  7, 12, 8, 21
> +};
> +int expected[N * 2] = {
> +  91537, 2, 1, 22000,
> +  60945, 28670, 21999, 52154,
> +  1, 2, 91537, 50557,
> +  60491, 29252, 74301, 74302,
> +  88337, 20249, 60491, 22421,
> +  28669, 3453, 1, 22873,
> +  50556, 22000, 34505, 2,
> +  60945, 29252, 42226, 26976
> +};
> +
> +int
> +main (void)
> +{
> +  check_vect ();
> +
> +  f (y, x, indices);
> +  for (int i = 0; i < 32; ++i)
> +    if (y[i] != expected[i])
> +      __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" vect { target { vect_gather_load_ifn && vect_masked_load } } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-gather-4.c b/gcc/testsuite/gcc.dg/vect/vect-gather-4.c
> new file mode 100644
> index 00000000000..ee2e4e4999a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-gather-4.c
> @@ -0,0 +1,48 @@
> +/* { dg-do compile } */
> +
> +#define N 16
> +
> +void
> +f1 (int *restrict y, int *restrict x1, int *restrict x2,
> +    int *restrict indices)
> +{
> +  for (int i = 0; i < N; ++i)
> +    {
> +      y[i * 2] = (indices[i * 2] < N * 2
> +                 ? x1[indices[i * 2]] + 1
> +                 : 1);
> +      y[i * 2 + 1] = (indices[i * 2 + 1] < N * 2
> +                     ? x2[indices[i * 2 + 1]] + 2
> +                     : 2);
> +    }
> +}
> +
> +void
> +f2 (int *restrict y, int *restrict x, int *restrict indices)
> +{
> +  for (int i = 0; i < N; ++i)
> +    {
> +      y[i * 2] = (indices[i * 2] < N * 2
> +                 ? x[indices[i * 2]] + 1
> +                 : 1);
> +      y[i * 2 + 1] = (indices[i * 2 + 1] < N * 2
> +                     ? x[indices[i * 2 + 1] * 2] + 2
> +                     : 2);
> +    }
> +}
> +
> +void
> +f3 (int *restrict y, int *restrict x, int *restrict indices)
> +{
> +  for (int i = 0; i < N; ++i)
> +    {
> +      y[i * 2] = (indices[i * 2] < N * 2
> +                 ? x[indices[i * 2]] + 1
> +                 : 1);
> +      y[i * 2 + 1] = (indices[i * 2 + 1] < N * 2
> +                     ? x[(unsigned int) indices[i * 2 + 1]] + 2
> +                     : 2);
> +    }
> +}
> +
> +/* { dg-final { scan-tree-dump-not "Loop contains only SLP stmts" vect { target vect_gather_load_ifn } } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_gather_load_8.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_gather_load_8.c
> new file mode 100644
> index 00000000000..95767f30a80
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_gather_load_8.c
> @@ -0,0 +1,65 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-vect-cost-model" } */
> +
> +#include <stdint.h>
> +
> +void
> +f1 (int32_t *restrict y, int32_t *restrict x, int32_t *restrict index)
> +{
> +  for (int i = 0; i < 100; ++i)
> +    {
> +      y[i * 2] = (index[i * 2] < 128
> +                 ? x[index[i * 2]] + 1
> +                 : 1);
> +      y[i * 2 + 1] = (index[i * 2 + 1] < 128
> +                     ? x[index[i * 2 + 1]] + 2
> +                     : 2);
> +    }
> +}
> +
> +void
> +f2 (int32_t *restrict y, int32_t *restrict x, uint32_t *restrict index)
> +{
> +  for (int i = 0; i < 100; ++i)
> +    {
> +      y[i * 2] = (index[i * 2] < 128
> +                 ? x[index[i * 2]] + 1
> +                 : 1);
> +      y[i * 2 + 1] = (index[i * 2 + 1] < 128
> +                     ? x[index[i * 2 + 1]] + 2
> +                     : 2);
> +    }
> +}
> +
> +void
> +f3 (int32_t *restrict y, int32_t *restrict x, uint64_t *restrict index)
> +{
> +  for (int i = 0; i < 100; ++i)
> +    {
> +      y[i * 2] = (index[i * 2] < 128
> +                 ? x[index[i * 2]] + 1
> +                 : 1);
> +      y[i * 2 + 1] = (index[i * 2 + 1] < 128
> +                     ? x[index[i * 2 + 1]] + 2
> +                     : 2);
> +    }
> +}
> +
> +void
> +f4 (int64_t *restrict y, int64_t *restrict x, uint64_t *restrict index)
> +{
> +  for (int i = 0; i < 100; ++i)
> +    {
> +      y[i * 2] = (index[i * 2] < 128
> +                 ? x[index[i * 2]] + 1
> +                 : 1);
> +      y[i * 2 + 1] = (index[i * 2 + 1] < 128
> +                     ? x[index[i * 2 + 1]] + 2
> +                     : 2);
> +    }
> +}
> +
> +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, sxtw #?2\]} 1 } } */
> +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, uxtw #?2\]} 1 } } */
> +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+\.d, lsl #?2\]} 1 } } */
> +/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+\.d, lsl #?3\]} 1 } } */
> diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
> index 0f09fc1fda8..35ec2e2ad5e 100644
> --- a/gcc/tree-vect-slp.c
> +++ b/gcc/tree-vect-slp.c
> @@ -461,6 +461,7 @@ static const int cond_expr_maps[3][5] = {
>  };
>  static const int arg1_map[] = { 1, 1 };
>  static const int arg2_map[] = { 1, 2 };
> +static const int arg1_arg4_map[] = { 2, 1, 4 };
>
>  /* For most SLP statements, there is a one-to-one mapping between
>     gimple arguments and child nodes.  If that is not true for STMT,
> @@ -494,6 +495,9 @@ vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
>           case IFN_GATHER_LOAD:
>             return arg1_map;
>
> +         case IFN_MASK_GATHER_LOAD:
> +           return arg1_arg4_map;
> +
>           default:
>             break;
>           }
> @@ -1000,7 +1004,9 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
>           else
>             rhs_code = CALL_EXPR;
>
> -         if (cfn == CFN_MASK_LOAD || cfn == CFN_GATHER_LOAD)
> +         if (cfn == CFN_MASK_LOAD
> +             || cfn == CFN_GATHER_LOAD
> +             || cfn == CFN_MASK_GATHER_LOAD)
>             load_p = true;
>           else if ((internal_fn_p (cfn)
>                     && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
> @@ -1229,7 +1235,9 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
>          } /* Grouped access.  */
>        else
>         {
> -         if (load_p && rhs_code != CFN_GATHER_LOAD)
> +         if (load_p
> +             && rhs_code != CFN_GATHER_LOAD
> +             && rhs_code != CFN_MASK_GATHER_LOAD)
>             {
>               /* Not grouped load.  */
>               if (dump_enabled_p ())
> @@ -1711,7 +1719,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
>      {
>        if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
>         gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
> -                   || gimple_call_internal_p (stmt, IFN_GATHER_LOAD));
> +                   || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
> +                   || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
>        else
>         {
>           *max_nunits = this_max_nunits;
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index 06da5a9bc13..8642acbc0b4 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -8595,6 +8595,7 @@ vectorizable_load (vec_info *vinfo,
>      return false;
>
>    tree mask = NULL_TREE, mask_vectype = NULL_TREE;
> +  int mask_index = -1;
>    if (gassign *assign = dyn_cast <gassign *> (stmt_info->stmt))
>      {
>        scalar_dest = gimple_assign_lhs (assign);
> @@ -8626,12 +8627,12 @@ vectorizable_load (vec_info *vinfo,
>        if (!scalar_dest)
>         return false;
>
> -      int mask_index = internal_fn_mask_index (ifn);
> +      mask_index = internal_fn_mask_index (ifn);
> +      /* ??? For SLP the mask operand is always last.  */
> +      if (mask_index >= 0 && slp_node)
> +       mask_index = SLP_TREE_CHILDREN (slp_node).length () - 1;
>        if (mask_index >= 0
> -         && !vect_check_scalar_mask (vinfo, stmt_info, slp_node,
> -                                     /* ??? For SLP we only have operands for
> -                                        the mask operand.  */
> -                                     slp_node ? 0 : mask_index,
> +         && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index,
>                                       &mask, NULL, &mask_dt, &mask_vectype))
>         return false;
>      }
> @@ -9393,8 +9394,14 @@ vectorizable_load (vec_info *vinfo,
>    vec<tree> vec_offsets = vNULL;
>    auto_vec<tree> vec_masks;
>    if (mask)
> -    vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies,
> -                      mask, &vec_masks, mask_vectype, NULL_TREE);
> +    {
> +      if (slp_node)
> +       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[mask_index],
> +                          &vec_masks);
> +      else
> +       vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies, mask,
> +                                      &vec_masks, mask_vectype);
> +    }
>    tree vec_mask = NULL_TREE;
>    poly_uint64 group_elt = 0;
>    for (j = 0; j < ncopies; j++)
> --
> 2.25.1
>
diff mbox series

Patch

diff --git a/gcc/testsuite/gcc.dg/vect/vect-gather-3.c b/gcc/testsuite/gcc.dg/vect/vect-gather-3.c
new file mode 100644
index 00000000000..738bd3f3106
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-gather-3.c
@@ -0,0 +1,64 @@ 
+#include "tree-vect.h"
+
+#define N 16
+
+void __attribute__((noipa))
+f (int *restrict y, int *restrict x, int *restrict indices)
+{
+  for (int i = 0; i < N; ++i)
+    {
+      y[i * 2] = (indices[i * 2] < N * 2
+		  ? x[indices[i * 2]] + 1
+		  : 1);
+      y[i * 2 + 1] = (indices[i * 2 + 1] < N * 2
+		      ? x[indices[i * 2 + 1]] + 2
+		      : 2);
+    }
+}
+
+int y[N * 2];
+int x[N * 2] = {
+  72704, 52152, 51301, 96681,
+  57937, 60490, 34504, 60944,
+  42225, 28333, 88336, 74300,
+  29250, 20484, 38852, 91536,
+  86917, 63941, 31590, 21998,
+  22419, 26974, 28668, 13968,
+  3451, 20247, 44089, 85521,
+  22871, 87362, 50555, 85939
+};
+int indices[N * 2] = {
+  15, 0x10000, 0xcafe0, 19,
+  7, 22, 19, 1,
+  0x20000, 0x70000, 15, 30,
+  5, 12, 11, 11,
+  10, 25, 5, 20,
+  22, 24, 32, 28,
+  30, 19, 6, 0xabcdef,
+  7, 12, 8, 21
+};
+int expected[N * 2] = {
+  91537, 2, 1, 22000,
+  60945, 28670, 21999, 52154,
+  1, 2, 91537, 50557,
+  60491, 29252, 74301, 74302,
+  88337, 20249, 60491, 22421,
+  28669, 3453, 1, 22873,
+  50556, 22000, 34505, 2,
+  60945, 29252, 42226, 26976
+};
+
+int
+main (void)
+{
+  check_vect ();
+
+  f (y, x, indices);
+  for (int i = 0; i < 32; ++i)
+    if (y[i] != expected[i])
+      __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" vect { target { vect_gather_load_ifn && vect_masked_load } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-gather-4.c b/gcc/testsuite/gcc.dg/vect/vect-gather-4.c
new file mode 100644
index 00000000000..ee2e4e4999a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-gather-4.c
@@ -0,0 +1,48 @@ 
+/* { dg-do compile } */
+
+#define N 16
+
+void
+f1 (int *restrict y, int *restrict x1, int *restrict x2,
+    int *restrict indices)
+{
+  for (int i = 0; i < N; ++i)
+    {
+      y[i * 2] = (indices[i * 2] < N * 2
+		  ? x1[indices[i * 2]] + 1
+		  : 1);
+      y[i * 2 + 1] = (indices[i * 2 + 1] < N * 2
+		      ? x2[indices[i * 2 + 1]] + 2
+		      : 2);
+    }
+}
+
+void
+f2 (int *restrict y, int *restrict x, int *restrict indices)
+{
+  for (int i = 0; i < N; ++i)
+    {
+      y[i * 2] = (indices[i * 2] < N * 2
+		  ? x[indices[i * 2]] + 1
+		  : 1);
+      y[i * 2 + 1] = (indices[i * 2 + 1] < N * 2
+		      ? x[indices[i * 2 + 1] * 2] + 2
+		      : 2);
+    }
+}
+
+void
+f3 (int *restrict y, int *restrict x, int *restrict indices)
+{
+  for (int i = 0; i < N; ++i)
+    {
+      y[i * 2] = (indices[i * 2] < N * 2
+		  ? x[indices[i * 2]] + 1
+		  : 1);
+      y[i * 2 + 1] = (indices[i * 2 + 1] < N * 2
+		      ? x[(unsigned int) indices[i * 2 + 1]] + 2
+		      : 2);
+    }
+}
+
+/* { dg-final { scan-tree-dump-not "Loop contains only SLP stmts" vect { target vect_gather_load_ifn } } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_gather_load_8.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_gather_load_8.c
new file mode 100644
index 00000000000..95767f30a80
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_gather_load_8.c
@@ -0,0 +1,65 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-vect-cost-model" } */
+
+#include <stdint.h>
+
+void
+f1 (int32_t *restrict y, int32_t *restrict x, int32_t *restrict index)
+{
+  for (int i = 0; i < 100; ++i)
+    {
+      y[i * 2] = (index[i * 2] < 128
+		  ? x[index[i * 2]] + 1
+		  : 1);
+      y[i * 2 + 1] = (index[i * 2 + 1] < 128
+		      ? x[index[i * 2 + 1]] + 2
+		      : 2);
+    }
+}
+
+void
+f2 (int32_t *restrict y, int32_t *restrict x, uint32_t *restrict index)
+{
+  for (int i = 0; i < 100; ++i)
+    {
+      y[i * 2] = (index[i * 2] < 128
+		  ? x[index[i * 2]] + 1
+		  : 1);
+      y[i * 2 + 1] = (index[i * 2 + 1] < 128
+		      ? x[index[i * 2 + 1]] + 2
+		      : 2);
+    }
+}
+
+void
+f3 (int32_t *restrict y, int32_t *restrict x, uint64_t *restrict index)
+{
+  for (int i = 0; i < 100; ++i)
+    {
+      y[i * 2] = (index[i * 2] < 128
+		  ? x[index[i * 2]] + 1
+		  : 1);
+      y[i * 2 + 1] = (index[i * 2 + 1] < 128
+		      ? x[index[i * 2 + 1]] + 2
+		      : 2);
+    }
+}
+
+void
+f4 (int64_t *restrict y, int64_t *restrict x, uint64_t *restrict index)
+{
+  for (int i = 0; i < 100; ++i)
+    {
+      y[i * 2] = (index[i * 2] < 128
+		  ? x[index[i * 2]] + 1
+		  : 1);
+      y[i * 2 + 1] = (index[i * 2 + 1] < 128
+		      ? x[index[i * 2 + 1]] + 2
+		      : 2);
+    }
+}
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, sxtw #?2\]} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, uxtw #?2\]} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+\.d, lsl #?2\]} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+\.d, lsl #?3\]} 1 } } */
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 0f09fc1fda8..35ec2e2ad5e 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -461,6 +461,7 @@  static const int cond_expr_maps[3][5] = {
 };
 static const int arg1_map[] = { 1, 1 };
 static const int arg2_map[] = { 1, 2 };
+static const int arg1_arg4_map[] = { 2, 1, 4 };
 
 /* For most SLP statements, there is a one-to-one mapping between
    gimple arguments and child nodes.  If that is not true for STMT,
@@ -494,6 +495,9 @@  vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
 	  case IFN_GATHER_LOAD:
 	    return arg1_map;
 
+	  case IFN_MASK_GATHER_LOAD:
+	    return arg1_arg4_map;
+
 	  default:
 	    break;
 	  }
@@ -1000,7 +1004,9 @@  vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
 	  else
 	    rhs_code = CALL_EXPR;
 
-	  if (cfn == CFN_MASK_LOAD || cfn == CFN_GATHER_LOAD)
+	  if (cfn == CFN_MASK_LOAD
+	      || cfn == CFN_GATHER_LOAD
+	      || cfn == CFN_MASK_GATHER_LOAD)
 	    load_p = true;
 	  else if ((internal_fn_p (cfn)
 		    && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
@@ -1229,7 +1235,9 @@  vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
         } /* Grouped access.  */
       else
 	{
-	  if (load_p && rhs_code != CFN_GATHER_LOAD)
+	  if (load_p
+	      && rhs_code != CFN_GATHER_LOAD
+	      && rhs_code != CFN_MASK_GATHER_LOAD)
 	    {
 	      /* Not grouped load.  */
 	      if (dump_enabled_p ())
@@ -1711,7 +1719,8 @@  vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
     {
       if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
 	gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
-		    || gimple_call_internal_p (stmt, IFN_GATHER_LOAD));
+		    || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
+		    || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
       else
 	{
 	  *max_nunits = this_max_nunits;
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 06da5a9bc13..8642acbc0b4 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -8595,6 +8595,7 @@  vectorizable_load (vec_info *vinfo,
     return false;
 
   tree mask = NULL_TREE, mask_vectype = NULL_TREE;
+  int mask_index = -1;
   if (gassign *assign = dyn_cast <gassign *> (stmt_info->stmt))
     {
       scalar_dest = gimple_assign_lhs (assign);
@@ -8626,12 +8627,12 @@  vectorizable_load (vec_info *vinfo,
       if (!scalar_dest)
 	return false;
 
-      int mask_index = internal_fn_mask_index (ifn);
+      mask_index = internal_fn_mask_index (ifn);
+      /* ??? For SLP the mask operand is always last.  */
+      if (mask_index >= 0 && slp_node)
+	mask_index = SLP_TREE_CHILDREN (slp_node).length () - 1;
       if (mask_index >= 0
-	  && !vect_check_scalar_mask (vinfo, stmt_info, slp_node,
-				      /* ??? For SLP we only have operands for
-					 the mask operand.  */
-				      slp_node ? 0 : mask_index,
+	  && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index,
 				      &mask, NULL, &mask_dt, &mask_vectype))
 	return false;
     }
@@ -9393,8 +9394,14 @@  vectorizable_load (vec_info *vinfo,
   vec<tree> vec_offsets = vNULL;
   auto_vec<tree> vec_masks;
   if (mask)
-    vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies,
-		       mask, &vec_masks, mask_vectype, NULL_TREE);
+    {
+      if (slp_node)
+	vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[mask_index],
+			   &vec_masks);
+      else
+	vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies, mask,
+				       &vec_masks, mask_vectype);
+    }
   tree vec_mask = NULL_TREE;
   poly_uint64 group_elt = 0;
   for (j = 0; j < ncopies; j++)