===================================================================
@@ -0,0 +1,1487 @@
+/* Subroutines used to transform array subscripting expressions into
+ forms that are more amenable to d-form instruction selection for p9
+ little-endian VSX code.
+ Copyright (C) 1991-2018 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3, or (at your
+ option) any later version.
+
+ GCC is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
+ License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "tree.h"
+#include "memmodel.h"
+#include "df.h"
+#include "tm_p.h"
+#include "ira.h"
+#include "print-tree.h"
+#include "varasm.h"
+#include "explow.h"
+#include "expr.h"
+#include "output.h"
+#include "tree-pass.h"
+#include "rtx-vector-builder.h"
+#include "cfgloop.h"
+
+#include "insn-config.h"
+#include "recog.h"
+
+#include "print-rtl.h"
+#include "tree-pretty-print.h"
+
+#include "genrtl.h"
+
+/* This pass transforms array indexing expressions from a form that
+ favors selection of X-form instructions into a form that favors
+ selection of D-form instructions.
+
+ Showing favor for D-form instructions is especially important when
+ targeting Power9, as the Power9 architecture added a number of new
+ D-form instruction capabilities.
+
+ Consider, for example, the following loop, excerpted from an actual
+ program:
+
+ double sacc, x[], y[], z[];
+ sacc = 0.00;
+ for (unsigned long long int i = 0; i < N; i++) {
+ z[i] = x[i] * y[i];
+ sacc += z[i];
+ }
+
+ Compile this program with the following gcc options which enable both
+ vectorization and loop unrolling:
+ -m64 -fdump-rtl-all-details -mcpu=power9 -mtune=power9 -funroll-loops -O3
+
+ Without this pass, this loop is represented by the following:
+
+ lxvx: 16
+ addi: 8
+ xvmuldp: 8
+ stxvx: 8
+ fmr: 8
+ xxpermdi: 8
+ fadd: 16
+ bdnz: 1
+ ___
+ total: 73 instructions
+
+.L3:
+ lxvx 0,29,11
+ lxvx 12,30,11
+ addi 12,11,16
+ addi 0,11,48
+ addi 5,11,64
+ addi 9,11,32
+ addi 6,11,80
+ addi 7,11,96
+ addi 8,11,112
+ lxvx 2,29,12
+ lxvx 3,30,12
+ lxvx 4,29,0
+ lxvx 5,30,0
+ lxvx 10,30,9
+ lxvx 11,29,5
+ xvmuldp 6,0,12
+ lxvx 13,30,5
+ lxvx 8,29,9
+ lxvx 27,29,6
+ lxvx 28,30,6
+ xvmuldp 7,2,3
+ lxvx 29,29,7
+ lxvx 30,30,7
+ xvmuldp 9,4,5
+ lxvx 3,30,8
+ lxvx 0,29,8
+ xvmuldp 8,8,10
+ xvmuldp 10,11,13
+ xvmuldp 11,27,28
+ xxpermdi 26,6,6,3
+ fmr 2,6
+ stxvx 6,31,11
+ xvmuldp 12,29,30
+ addi 11,11,128
+ fadd 1,26,1
+ xxpermdi 26,7,7,3
+ stxvx 7,31,12
+ fmr 27,7
+ xvmuldp 0,0,3
+ xxpermdi 30,9,9,3
+ fmr 31,9
+ stxvx 8,31,9
+ xxpermdi 13,10,10,3
+ xxpermdi 28,8,8,3
+ stxvx 9,31,0
+ stxvx 10,31,5
+ fadd 1,2,1
+ fmr 2,10
+ xxpermdi 3,11,11,3
+ stxvx 11,31,6
+ fmr 4,11
+ fmr 29,8
+ xxpermdi 5,12,12,3
+ fmr 6,12
+ stxvx 12,31,7
+ xxpermdi 9,0,0,3
+ fmr 8,0
+ fadd 7,26,1
+ stxvx 0,31,8
+ fadd 10,27,7
+ fadd 11,28,10
+ fadd 12,29,11
+ fadd 26,30,12
+ fadd 27,31,26
+ fadd 30,13,27
+ fadd 31,2,30
+ fadd 0,3,31
+ fadd 28,4,0
+ fadd 29,5,28
+ fadd 13,6,29
+ fadd 1,9,13
+ fadd 1,8,1
+ bdnz .L3
+
+ With this pass, the same loop is represented by:
+
+ lxvx: 4
+ lxv: 12
+ addi: 2
+ add: 3
+ xvmuldp: 8
+ stxvx: 2
+ stxv: 6
+ fmr: 8
+ xxpermdi: 8
+ fadd: 16
+ bdnz: 1
+ ___
+ total: 70 instructions
+
+.L3:
+ lxvx 0,29,6
+ lxvx 12,30,6
+ addi 10,6,16
+ add 7,29,10
+ add 8,30,10
+ lxvx 2,29,10
+ lxvx 3,30,10
+ add 11,31,10
+ lxv 4,16(7)
+ lxv 9,16(8)
+ xvmuldp 6,0,12
+ lxv 13,64(8)
+ lxv 5,32(7)
+ lxv 28,32(8)
+ lxv 11,64(7)
+ xvmuldp 7,2,3
+ lxv 30,80(7)
+ lxv 12,80(8)
+ lxv 31,48(8)
+ lxv 10,48(7)
+ xvmuldp 8,4,9
+ lxv 3,96(8)
+ lxv 0,96(7)
+ xvmuldp 9,5,28
+ xvmuldp 11,11,13
+ xxpermdi 29,6,6,3
+ fmr 4,6
+ stxvx 6,31,6
+ xvmuldp 12,30,12
+ addi 6,6,128
+ fadd 1,29,1
+ xxpermdi 28,7,7,3
+ fmr 29,7
+ stxvx 7,31,10
+ xvmuldp 10,10,31
+ xxpermdi 30,8,8,3
+ fmr 31,8
+ stxv 8,16(11)
+ xvmuldp 0,0,3
+ xxpermdi 5,11,11,3
+ fmr 6,11
+ xxpermdi 13,9,9,3
+ fadd 1,4,1
+ stxv 11,64(11)
+ xxpermdi 7,12,12,3
+ fmr 8,12
+ stxv 12,80(11)
+ fmr 2,9
+ xxpermdi 3,10,10,3
+ fmr 4,10
+ stxv 9,32(11)
+ stxv 10,48(11)
+ fadd 28,28,1
+ xxpermdi 9,0,0,3
+ fmr 10,0
+ stxv 0,96(11)
+ fadd 11,29,28
+ fadd 29,30,11
+ fadd 12,31,29
+ fadd 30,13,12
+ fadd 31,2,30
+ fadd 13,3,31
+ fadd 2,4,13
+ fadd 1,5,2
+ fadd 0,6,1
+ fadd 3,7,0
+ fadd 4,8,3
+ fadd 5,9,4
+ fadd 1,10,5
+ bdnz .L3
+
+ The optimized loop body replaces 12 lxvx instructions with lxv
+ instructions, 6 stxvx instructions with stxv, and has 3 fewer add
+ operations.
+
+ This pass runs immediately after pass_loops2. Loops have already
+ been unrolled. The pass searches
+ for sequences of code of the form:
+
+ A0: *(array_base + offset)
+
+ Aij: (optional, for j = 1 .. Ki, with different Ki
+ for each value of i. If Ki equals 0, there
+ are no constant adjustments to offset for
+ this value of i)
+ offset += constant (i, j)
+
+ Ai: (for i = 1 .. N)
+ *(array_base + offset)
+
+ where for each i = 1 to N and j = 1 to K-1
+ A(i-1) dominates A(i), and
+ A(i-1) dominates A(i, 1) and
+ A(i,j) dominates A(i, j+1) and
+ A(i,K) dominanes A(i) and
+ there are no other assignments to offset between A0 and AN
+
+ It replaces these sequences with:
+
+ A0: derived_pointer = array_base + offset
+ *(derived_pointer)
+
+ Aij: leave this alone. expect that subsequent optimization deletes
+ this code as it may become dead (since we don't use the
+ indexing expression following out code transformations.)
+
+ Ai:
+ *(derived_pointer + constant_i)
+ (where constant_i equals sum of constant (n,j) for all n from 1
+ to i paired with all j from 1 to Kn,
+
+ Note that each function may match this pattern multiple times.
+
+
+*/
+
+/* This is based on the union-find logic in web.c. web_entry_base is
+ defined in df.h. */
+class indexing_web_entry : public web_entry_base
+{
+ public:
+ rtx_insn *insn; /* Pointer to the insn */
+ basic_block bb; /* Pointer to the enclosing basic block */
+
+ /* If this insn is relevant, it is a load or store with a memory
+ address that is comprised of a base pointer (e.g. the address of
+ an array or array slice) and an index expression (e.g. an index
+ within the array). The original_base_use and original_index_use
+ fields represent the numbers of the instructions that define the
+ base and index values which are summed together with a constant
+ value to determine the value of this instruction's memory
+ address. */
+ unsigned int original_base_use;
+ unsigned int original_index_use;
+
+ /* If this insn is relevant, the register assigned by insn
+ original_base_use is original_base_reg. The insn assigned by insn
+ original_index_use is original_index_reg. */
+ unsigned int original_base_reg;
+ unsigned int original_index_reg;
+
+ /* If this insn is_relevant, this is the constant that is added to
+ the originating expression to calculate the value of this insn's
+ memory address. */
+ int base_delta;
+ int index_delta;
+
+ /* If this insn is relevant, it belongs to an equivalence class.
+ The equivalence classes are identified by the definitions that
+ define the inputs to this insn. */
+ unsigned int base_equivalence_hash;
+ unsigned int index_equivalence_hash;
+
+ /* When multiple insns fall within the same equivalence class, they
+ are linked together through this field. The value UINT_MAX
+ represents the end of this list. */
+ unsigned int equivalent_sibling;
+
+ unsigned int is_relevant : 1;
+ unsigned int is_load : 1;
+ unsigned int is_store : 1;
+ unsigned int is_originating : 1;
+};
+
+static int count_links (struct df_link *def_link)
+{
+ int result;
+ for (result = 0; def_link != NULL; result++)
+ def_link = def_link->next;
+ return result;
+}
+
+static int max_use_links = 0;
+
+static int
+int_compare (const void *v1, const void *v2)
+{
+ const int *i1 = (const int *) v1;
+ const int *i2 = (const int *) v2;
+ return *i1 - *i2;
+}
+
+static int
+help_hash (int count, struct df_link *def_link)
+{
+ int *ids;
+ int i = 0;
+
+ ids = (int *) alloca (count * sizeof (ids[0]));
+ if (count > max_use_links)
+ max_use_links = count;
+
+ while (def_link != NULL)
+ {
+ ids[i++] = DF_REF_ID (def_link->ref);
+ def_link = def_link->next;
+ }
+
+ /* sort to put ids in ascending order. */
+ qsort ((void *) ids, count, sizeof (ids[0]), int_compare);
+
+ int result = 0;
+ for (i = 0; i < count; i++)
+ {
+ result = (result << 6) | ((result & 0xf000000) >> 28);
+ result += ids[i];
+ }
+ return result;
+}
+
+static int
+equivalence_hash (struct df_link *def_link)
+{
+ int count = count_links (def_link);
+ return help_hash (count, def_link);
+}
+
+static
+void help_find_defs (indexing_web_entry *insn_entry,
+ unsigned int uid, rtx base_reg, rtx index_reg,
+ struct df_link **insn_base, struct df_link **insn_index)
+{
+ rtx_insn *insn = insn_entry[uid].insn;
+ struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+ df_ref use;
+ FOR_EACH_INSN_INFO_USE (use, insn_info)
+ {
+ if (rtx_equal_p (DF_REF_REG (use), base_reg))
+ {
+ struct df_link *def_link = DF_REF_CHAIN (use);
+ /* if there is no def, or the def is artificial,
+ or there are multiple defs, this is an originating
+ use. */
+ if (!def_link || !def_link->ref
+ || DF_REF_IS_ARTIFICIAL (def_link->ref)
+ || def_link->next)
+ *insn_base = def_link;
+ else
+ {
+ unsigned int uid2 =
+ insn_entry[uid].original_base_use;
+ df_ref use2;
+ if (uid2 > 0)
+ {
+ rtx_insn *insn2 = insn_entry[uid2].insn;
+ struct df_insn_info *insn_info2 =
+ DF_INSN_INFO_GET (insn2);
+ use2 = DF_INSN_INFO_USES (insn_info2);
+ if (use2)
+ *insn_base = DF_REF_CHAIN (use2);
+ else
+ *insn_base = NULL;
+ }
+ }
+ }
+ else if (rtx_equal_p (DF_REF_REG (use), index_reg))
+ {
+ struct df_link *def_link = DF_REF_CHAIN (use);
+ /* if there is no def, or the def is artificial,
+ or there are multiple defs, this is an originating
+ use. */
+ if (!def_link || !def_link->ref
+ || DF_REF_IS_ARTIFICIAL (def_link->ref)
+ || def_link->next)
+ *insn_index = def_link;
+ else
+ {
+ unsigned int uid2 =
+ insn_entry[uid].original_index_use;
+ df_ref use2;
+ if (uid2 > 0)
+ {
+ rtx_insn *insn2 = insn_entry[uid2].insn;
+ struct df_insn_info *insn_info2 =
+ DF_INSN_INFO_GET (insn2);
+ use2 = DF_INSN_INFO_USES (insn_info2);
+ if (use2)
+ *insn_index = DF_REF_CHAIN (use2);
+ else
+ *insn_index = NULL;
+ }
+ }
+ }
+ }
+}
+
+
+static void
+find_defs (indexing_web_entry *insn_entry, rtx_insn *insn,
+ struct df_link **insn_base, struct df_link **insn_index)
+{
+ unsigned int uid = INSN_UID (insn);
+ rtx body = PATTERN (insn);
+ rtx mem = NULL;
+ if (GET_CODE (body) == SET)
+ {
+ if (MEM_P (SET_SRC (body)))
+ mem = XEXP (SET_SRC (body), 0);
+ else if (MEM_P (SET_DEST (body)))
+ mem = XEXP (SET_DEST (body), 0);
+ }
+ /* Otherwise, this is neither load or store, so leave mem as NULL. */
+
+ if (mem != NULL)
+ {
+ enum rtx_code code = GET_CODE (mem);
+ if ((code == PLUS) || (code == MINUS))
+ {
+ rtx base_reg = XEXP (mem, 0);
+ rtx index_reg = XEXP (mem, 1);
+ if (REG_P (base_reg) && REG_P (index_reg))
+ help_find_defs (insn_entry, uid,
+ base_reg, index_reg, insn_base, insn_index);
+ }
+ }
+}
+
+/* Return non-zero if and only if USE represents a compile-time constant. */
+static bool
+represents_constant_p (df_ref use)
+{
+ struct df_link *def_link = DF_REF_CHAIN (use);
+
+ /* If there is no definition, or the definition is
+ artificial, or there are multiple definitions, this
+ is an originating use. */
+ if (!def_link || !def_link->ref
+ || DF_REF_IS_ARTIFICIAL (def_link->ref) || def_link->next)
+ return false;
+ else
+ {
+ rtx def_insn = DF_REF_INSN (def_link->ref);
+ /* unsigned uid = INSN_UID (def_insn); not needed? */
+ rtx body = PATTERN (def_insn);
+ if (GET_CODE (body) == CONST_INT)
+ return true;
+ else if (GET_CODE (body) == SET)
+ {
+ /* recurse on the use that defines this variable */
+ struct df_insn_info *inner_insn_info = DF_INSN_INFO_GET (def_insn);
+ df_ref inner_use;
+ FOR_EACH_INSN_INFO_USE (inner_use, inner_insn_info)
+ {
+ if (!represents_constant_p (inner_use))
+ return false;
+ }
+ /* There were multiple defs but they are all constant. */
+ return true;
+ }
+ else /* treat unrecognized codes as not constant */
+ return false;
+ }
+}
+
+
+/* An originator represents the first point at which the value of
+ DEF_LINK is derived from potentially more than one input
+ definition, or the point at which DEF_LINK's value is defined by an
+ algebraic expression involving only constants,
+
+ If DEF_LINK's value depends on a constant combined with a single
+ variable or a simple propagation of a single variable, continue
+ the search for the originator by examining the origin of the source
+ variable's value.
+
+ The value of *ADJUSTMENT is overwritten with the constant value that is
+ added to the originator expression to obtain the value intended to
+ be represented by DEF_LINK. In the case that find_true_originator
+ returns NULL, the value held in *ADJUSTMENT is undefined.
+
+ Returns NULL if there is no single true originator. In general, the search
+ for an originator expression only spans SET operations that are
+ based on simple algebraic expressions, each of which involves no
+ more than one variable input. */
+static rtx
+find_true_originator (struct df_link *def_link, long long int *adjustment)
+{
+ rtx def_insn = DF_REF_INSN (def_link->ref);
+
+ rtx inner_body = PATTERN (def_insn);
+ if (GET_CODE (inner_body) == SET)
+ {
+ struct df_insn_info *inner_insn_info = DF_INSN_INFO_GET (def_insn);
+ df_ref inner_use;
+
+ /* We're only happy with multiple uses if all but one represent
+ constant values. */
+ int non_constant_uses = 0;
+ rtx result = NULL;
+ FOR_EACH_INSN_INFO_USE (inner_use, inner_insn_info)
+ {
+ if (!represents_constant_p (inner_use))
+ {
+ non_constant_uses++;
+ /* There should be only one non-constant use, and it should
+ satisfy find_true_originator. */
+ struct df_link *def_link = DF_REF_CHAIN (inner_use);
+
+ /* If there is no definition, or the definition is
+ artificial, or there are multiple definitions, this
+ is an originating use. */
+ if (!def_link || !def_link->ref
+ || DF_REF_IS_ARTIFICIAL (def_link->ref) || def_link->next)
+ result = def_insn;
+ else
+ result = find_true_originator (def_link, adjustment);
+ }
+ }
+
+ /* If non_constant_uses > 1, the value of result is not well
+ defined because it is overwritten during multiple iterations
+ of the above loop. But in the case that non_constant_uses >
+ 1, we do not make use of the result value. */
+ if (non_constant_uses == 1) {
+
+ /* If my SET looks like a simple register copy, or if it looks
+ like PLUS or MINUS of a constant and a register, this is
+ what we optimize. Otherwise, punt. */
+
+ if (result == NULL)
+ /* Doing constant arithmetic with unknown originator is not
+ useful. */
+ return def_insn;
+
+ rtx source_expr = SET_SRC (inner_body);
+ int source_code = GET_CODE (source_expr);
+ if (source_code == PLUS)
+ {
+ rtx op1 = XEXP (source_expr, 0);
+ rtx op2 = XEXP (source_expr, 1);
+
+ if ((GET_CODE (op1) == CONST_INT) && (GET_CODE (op2) != CONST_INT))
+ *adjustment += INTVAL (op1);
+ else if ((GET_CODE (op1) != CONST_INT)
+ && (GET_CODE (op2) == CONST_INT))
+ *adjustment += INTVAL (op2);
+ }
+ else if (source_code == MINUS)
+ {
+ rtx op1 = XEXP (source_expr, 0);
+ rtx op2 = XEXP (source_expr, 1);
+
+ if ((GET_CODE (op1) != CONST_INT) && (GET_CODE (op2) == CONST_INT))
+ *adjustment -= INTVAL (op1);
+ else
+ /* assumption is that *adjustment is added to a positive variable
+ expression, so don't optimize this rare condition. */
+ result = def_insn;
+ }
+ else if (source_code != REG)
+ /* We don't handle ashift, UNSPEC, etc.. */
+ result = def_insn;
+ /* else, register copy expression does not impact adjustment. */
+
+ return result;
+ }
+ else
+ /* Same behavior if there are too many non-constant inputs, or if
+ all inputs are constant. */
+ return def_insn;
+ }
+ else
+ /* This is not a SET. It does not serve as a true originator. */
+ return NULL;
+}
+
+/* The size of the insn_entry array. Note that this array does not
+ represent instructions created during this optimization pass. */
+static unsigned int max_uid_at_start;
+
+static bool
+in_use_list (struct df_link *list, struct df_link *element)
+{
+ while (list != NULL)
+ {
+ if (element->ref == list->ref)
+ return true;
+ list = list->next;
+ }
+ /* Got to end of list without finding element. */
+ return false;
+}
+
+/* Return true iff the instruction represented by uid_1 is in the same
+ equivalence class as the instruction represented by uid_2. */
+static bool
+equivalent_p (indexing_web_entry *insn_entry,
+ unsigned int uid_1, unsigned int uid_2)
+{
+ if ((insn_entry[uid_1].base_equivalence_hash !=
+ insn_entry[uid_2].base_equivalence_hash) ||
+ (insn_entry[uid_1].index_equivalence_hash !=
+ insn_entry[uid_2].index_equivalence_hash))
+ return false;
+ else /* Hash codes match. Check details. */
+ {
+ rtx_insn *insn_1, *insn_2;
+ insn_1 = insn_entry[uid_1].insn;
+ insn_2 = insn_entry[uid_2].insn;
+
+ struct df_link *insn1_base_defs, *insn1_index_defs;
+ struct df_link *insn2_base_defs, *insn2_index_defs;
+
+ find_defs (insn_entry, insn_1, &insn1_base_defs, &insn1_index_defs);
+ find_defs (insn_entry, insn_2, &insn2_base_defs, &insn2_index_defs);
+
+ int base_count_1 = count_links (insn1_base_defs);
+ int index_count_1 = count_links (insn1_index_defs);
+ int base_count_2 = count_links (insn2_base_defs);
+ int index_count_2 = count_links (insn2_index_defs);
+
+ if ((base_count_1 != base_count_2) || (index_count_1 != index_count_2))
+ return false;
+
+ /* Counts are the same. Make sure elements match. */
+ while (insn1_base_defs != NULL)
+ {
+ if (!in_use_list (insn2_base_defs, insn1_base_defs))
+ return false;
+ insn1_base_defs = insn1_base_defs->next;
+ }
+ /* base patterns match, but stil need to consider index matches. */
+ while (insn1_index_defs != NULL)
+ {
+ if (!in_use_list (insn2_index_defs, insn1_index_defs))
+ return false;
+ insn1_index_defs = insn1_index_defs->next;
+ }
+
+ if (insn_entry [uid_1].original_base_reg
+ != insn_entry [uid_2].original_base_reg)
+ return false;
+ else if (insn_entry [uid_1].original_index_reg
+ != insn_entry [uid_2].original_index_reg)
+ return false;
+ else
+ return true;
+ }
+}
+
+/* Return true iff instruction I2 dominates instruction I1. */
+static bool
+insn_dominated_by_p (rtx_insn *i1, rtx_insn *i2)
+{
+ basic_block bb1 = BLOCK_FOR_INSN (i1);
+ basic_block bb2 = BLOCK_FOR_INSN (i2);
+
+ if (bb1 == bb2)
+ {
+ for (rtx_insn *i = i2;
+ (i != NULL) && (BLOCK_FOR_INSN (i) == bb1); i = NEXT_INSN (i))
+ if (i == i1)
+ return true;
+ return false;
+ }
+ else
+ return dominated_by_p (CDI_DOMINATORS, bb1, bb2);
+}
+
+static void
+build_and_fixup_equivalence_classes (indexing_web_entry *insn_entry)
+{
+ unsigned int i;
+ unsigned int *equivalence_hash =
+ (unsigned int *) alloca (max_uid_at_start * sizeof (unsigned int));
+
+ for (i = 0; i < max_uid_at_start; i++)
+ equivalence_hash [i] = UINT_MAX;
+
+ for (unsigned int uid = 0; uid < max_uid_at_start; uid++)
+ {
+ if (insn_entry [uid].is_relevant)
+ {
+ int hash = ((insn_entry [uid].base_equivalence_hash +
+ insn_entry [uid].index_equivalence_hash)
+ % max_uid_at_start);
+
+ if (equivalence_hash [hash] == UINT_MAX)
+ { /* first mention of this class */
+ equivalence_hash [hash] = uid;
+ insn_entry [uid].equivalent_sibling = UINT_MAX;
+ }
+ else
+ {
+ while ((equivalence_hash [hash] != UINT_MAX)
+ && !equivalent_p (insn_entry, uid,
+ equivalence_hash [hash]))
+ hash = (hash + 1) % max_uid_at_start;
+
+ /* either we have found an equivalent instruction, or
+ the equivalence class does not yet exist. */
+ if (equivalence_hash [hash] != UINT_MAX)
+ {
+ insn_entry [uid].equivalent_sibling =
+ equivalence_hash [hash];
+ equivalence_hash [hash] = uid;
+ }
+ else /* Equivalence class doesn't yet exist. */
+ {
+ equivalence_hash [hash] = uid;
+ insn_entry [uid].equivalent_sibling = UINT_MAX;
+ }
+ }
+ }
+ }
+
+ for (unsigned int i = 0; i < max_uid_at_start; i++)
+ {
+ while (equivalence_hash [i] != UINT_MAX)
+ {
+ unsigned int the_dominator = equivalence_hash [i];
+ unsigned int uid;
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "Equivalence class consists of\n");
+
+ for (uid = the_dominator; uid != UINT_MAX;
+ uid = insn_entry [uid].equivalent_sibling)
+ {
+ if (insn_dominated_by_p (insn_entry [the_dominator].insn,
+ insn_entry [uid].insn))
+ the_dominator = uid;
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, " member: %d\n", uid);
+ }
+
+ int removed_partition = UINT_MAX;
+ int size_of_equivalence = 0;
+ unsigned int preceding_uid = UINT_MAX;
+ unsigned int next_uid;
+
+ /* Having found a dominator, remove from this equivalence
+ class any element that is not dominated by the_dominator. */
+ for (uid = equivalence_hash [i]; uid != UINT_MAX; uid = next_uid)
+ {
+ next_uid = insn_entry [uid].equivalent_sibling;
+ if (!insn_dominated_by_p (insn_entry [uid].insn,
+ insn_entry [the_dominator].insn))
+ {
+ /* insn uid thinks its in this equivalence class, but
+ it's not dominated by the_dominator, so remove it. */
+ insn_entry [uid].equivalent_sibling = removed_partition;
+ removed_partition = uid;
+ if (preceding_uid == UINT_MAX)
+ equivalence_hash [i] = next_uid;
+ else
+ insn_entry [preceding_uid].equivalent_sibling = next_uid;
+ }
+ else
+ {
+ size_of_equivalence++;
+ preceding_uid = uid;
+ }
+ }
+
+ int replacement_count = size_of_equivalence;
+ /* Confirm that everything in the equivalence class is
+ eligible for representation as a d-form insn. Otherwise,
+ remove additional entries from the equivalence class. */
+ long long int dominator_delta =
+ (insn_entry [the_dominator].base_delta
+ + insn_entry [the_dominator].index_delta);
+ for (uid = equivalence_hash [i]; uid != UINT_MAX;
+ uid = insn_entry [uid].equivalent_sibling)
+ {
+ if (uid != the_dominator)
+ {
+ long long int dominated_delta =
+ (insn_entry [uid].base_delta
+ + insn_entry [uid].index_delta);
+ dominated_delta -= dominator_delta;
+
+ rtx_insn *insn = insn_entry [uid].insn;
+ rtx body = PATTERN (insn);
+ rtx mem;
+
+ gcc_assert (GET_CODE (body) == SET);
+
+ if (MEM_P (SET_SRC (body))) /* load */
+ {
+ mem = SET_SRC (body);
+ if (!rs6000_target_supports_dform_offset_p
+ (GET_MODE (mem), dominated_delta))
+ replacement_count--;
+ }
+ else
+ {
+ mem = SET_DEST (body); /* store */
+ if (!rs6000_target_supports_dform_offset_p
+ (GET_MODE (mem), dominated_delta))
+ replacement_count--;
+ }
+ }
+ }
+
+ if (replacement_count > 1)
+ {
+ /* First, fix up the_dominator instruction. */
+ rtx derived_ptr_reg = gen_reg_rtx (Pmode);
+ rtx_insn *insn = insn_entry [the_dominator].insn;
+ rtx body = PATTERN (insn);
+ rtx base_reg, index_reg;
+ rtx addr, mem;
+ rtx new_init_expr;
+ /* not used: rtx new_delta_expr = NULL; */
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "Endeavoring to replace "
+ "originating insn %d: ", the_dominator);
+ print_inline_rtx (dump_file, insn, 2);
+ fprintf (dump_file, "\n");
+ }
+
+ gcc_assert (GET_CODE (body) == SET);
+ if (MEM_P (SET_SRC (body)))
+ {
+ /* originating instruction is a load */
+ mem = SET_SRC (body);
+ addr = XEXP (SET_SRC (body), 0);
+ }
+ else
+ { /* originating instruction is a store */
+ gcc_assert (MEM_P (SET_DEST (body)));
+ mem = SET_DEST (body);
+ addr = XEXP (SET_DEST (body), 0);
+ }
+
+ enum rtx_code code = GET_CODE (addr);
+ gcc_assert ((code == PLUS) || (code == MINUS));
+ base_reg = XEXP (addr, 0);
+ index_reg = XEXP (addr, 1);
+
+ if (code == PLUS)
+ new_init_expr = gen_rtx_PLUS (Pmode, base_reg, index_reg);
+ else
+ new_init_expr = gen_rtx_MINUS (Pmode, base_reg, index_reg);
+ new_init_expr = gen_rtx_SET (derived_ptr_reg, new_init_expr);
+
+ rtx_insn *new_insn = emit_insn_before (new_init_expr, insn);
+ set_block_for_insn (new_insn, BLOCK_FOR_INSN (insn));
+ INSN_CODE (new_insn) = -1; /* force re-recogniition. */
+ df_insn_rescan (new_insn);
+
+ if (dump_file) {
+ fprintf (dump_file, "with insn %d: ", INSN_UID (new_insn));
+ print_inline_rtx (dump_file, new_insn, 2);
+ fprintf (dump_file, "\n");
+ }
+
+ /* If dominator_delta != 0, we need to make adjustments
+ for dominator_delta in the D-form constant offsets
+ associated with the propagating instructions. */
+
+ rtx new_mem = gen_rtx_MEM (GET_MODE (mem), derived_ptr_reg);
+ MEM_COPY_ATTRIBUTES (new_mem, mem);
+ rtx new_expr;
+ if (insn_entry [the_dominator].is_load)
+ new_expr = gen_rtx_SET (SET_DEST (body), new_mem);
+ else
+ new_expr = gen_rtx_SET (new_mem, SET_SRC (body));
+
+ if (!validate_change (insn, &PATTERN(insn), new_expr, false))
+ { /* proposed change was rejected */
+ if (dump_file)
+ {
+ fprintf (dump_file,
+ "Proposed dform optimization was"
+ " rejected by validate_change\n");
+ print_inline_rtx (dump_file, new_insn, 2);
+ fprintf (dump_file, "\n");
+ }
+ }
+ else if (dump_file)
+ {
+ fprintf (dump_file, "and with insn %d: ",
+ INSN_UID (insn));
+ print_inline_rtx (dump_file, insn, 2);
+ fprintf (dump_file, "\n");
+ }
+
+ for (uid = equivalence_hash [i]; uid != UINT_MAX;
+ uid = insn_entry [uid].equivalent_sibling)
+ {
+ if (uid != the_dominator)
+ {
+ long long int dominated_delta =
+ (insn_entry [uid].base_delta
+ + insn_entry [uid].index_delta);
+ dominated_delta -= dominator_delta;
+
+ rtx_insn *insn = insn_entry [uid].insn;
+ rtx body = PATTERN (insn);
+ rtx mem;
+
+ if (dump_file) {
+ fprintf (dump_file, "Endeavoring to replace "
+ "propagating insn %d: ", uid);
+ print_inline_rtx (dump_file, insn, 2);
+ fprintf (dump_file, "\n");
+ }
+
+ gcc_assert (GET_CODE (body) == SET);
+
+ if (MEM_P (SET_SRC (body))) /* load */
+ mem = SET_SRC (body);
+ else
+ mem = SET_DEST (body); /* store */
+
+ rtx ci = gen_rtx_raw_CONST_INT (Pmode, dominated_delta);
+ rtx addr_expr = gen_rtx_PLUS (Pmode,
+ derived_ptr_reg, ci);
+ rtx new_mem = gen_rtx_MEM (GET_MODE (mem), addr_expr);
+ MEM_COPY_ATTRIBUTES (new_mem, mem);
+
+ rtx new_expr;
+ if (insn_entry [uid].is_load)
+ new_expr = gen_rtx_SET (SET_DEST (body), new_mem);
+ else
+ new_expr = gen_rtx_SET (new_mem, SET_SRC (body));
+
+ if (!validate_change (insn, &PATTERN(insn),
+ new_expr, false))
+ { /* proposed change was rejected */
+ if (dump_file)
+ {
+ fprintf (dump_file,
+ "Proposed dform optimization was"
+ " rejected by validate_change\n");
+ print_inline_rtx (dump_file, new_expr, 2);
+ fprintf (dump_file, "\n");
+ }
+ }
+ else if (dump_file)
+ {
+ fprintf (dump_file,
+ "with insn %d: ", INSN_UID (insn));
+ print_inline_rtx (dump_file, insn, 2);
+ fprintf (dump_file, "\n");
+ }
+ }
+ }
+ }
+ else if (dump_file)
+ {
+ fprintf (dump_file,
+ "Abandoning dform optimization: too few dform insns\n");
+ }
+
+ /* if (removed_partition !- UINT_MAX), need to reprocess the
+ contents of the removed_partition. There may be
+ additional opportunity to optimize within the set of
+ insns that were not dominated by the selected dominator.
+
+ Each time through this loop, at least one dominator and
+ any instructions it dominates are "processed". Anything
+ not dominated by the selected dominator remains in the
+ "removed partition". The "removed partition" gets
+ smaller on each iteration, assuring eventual termination. */
+ equivalence_hash [i] = removed_partition;
+ }
+ }
+}
+
+static void
+attempt_transform (rtx mem, rtx_insn *insn, indexing_web_entry *insn_entry)
+{
+ unsigned int uid = INSN_UID (insn);
+ rtx base_reg = XEXP (mem,0);
+ rtx index_reg = XEXP (mem, 1);
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ fprintf (dump_file, " memory is base + index, ");
+ fprintf (dump_file, "base: ");
+ print_inline_rtx (dump_file, base_reg, 2);
+ fprintf (dump_file, "\n index: ");
+ print_inline_rtx (dump_file, index_reg, 2);
+ fprintf (dump_file, "\n");
+ }
+
+ if (REG_P (base_reg) && REG_P (index_reg))
+ {
+ struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+ /* Since insn is known to represent a sum or
+ difference, this insn is likely to use at
+ least two input variables. */
+
+ int num_base_defs = 0;
+ int num_index_defs = 0;
+ bool base_is_relevant = false;
+ bool index_is_relevant = false;
+ bool base_is_originating = false;
+ bool index_is_originating = false;
+ df_ref use;
+ FOR_EACH_INSN_INFO_USE (use, insn_info)
+ {
+ if (rtx_equal_p (DF_REF_REG (use), base_reg))
+ {
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ fprintf (dump_file,
+ "Found use corresponding to base_reg\n");
+ df_ref_debug (use, dump_file);
+ }
+ struct df_link *def_link = DF_REF_CHAIN (use);
+ num_base_defs++;
+
+ /* If there is no definition, or the definition is artificial, or
+ there are multiple definitions, this is originating use. */
+ if (!def_link || !def_link->ref
+ || DF_REF_IS_ARTIFICIAL (def_link->ref) || def_link->next)
+ {
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "Use is originating!\n");
+
+ base_is_relevant = true;
+ base_is_originating = true;
+ insn_entry[uid].original_base_reg = REGNO (base_reg);
+ insn_entry[uid].original_base_use = uid;
+ insn_entry[uid].base_delta = 0;
+ insn_entry[uid].base_equivalence_hash
+ = equivalence_hash (def_link);
+ }
+ else
+ {
+ /* Only one definition. Dig deeper. */
+ long long int delta = 0;
+ rtx insn2 = find_true_originator (def_link, &delta);
+ if (insn2)
+ {
+ unsigned uid2 = INSN_UID (insn2);
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "Use may propagate from %d\n",
+ uid2);
+
+ struct df_insn_info *insn_info2 =
+ DF_INSN_INFO_GET (insn2);
+
+ df_ref use2;
+ if (insn_info2)
+ use2 = DF_INSN_INFO_USES (insn_info2);
+ else
+ use2 = NULL;
+
+ if (!use2 || !DF_REF_NEXT_LOC (use2))
+ {
+ base_is_originating = false;
+
+ rtx body = PATTERN (insn2);
+ gcc_assert (GET_CODE (body) == SET);
+ gcc_assert (REG_P (SET_DEST (body)));
+
+ insn_entry[uid].original_base_reg
+ = REGNO (SET_DEST (body));
+ insn_entry[uid].original_base_use = uid2;
+ insn_entry[uid].base_delta = delta;
+
+ if (use2)
+ {
+ struct df_link *def_link = DF_REF_CHAIN (use2);
+ base_is_relevant = true;
+ insn_entry[uid].base_equivalence_hash
+ = equivalence_hash (def_link);
+ }
+ /* else, base is not relevant. */
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file,
+ " propagates from originating insn %d"
+ " with delta: %lld\n", uid2, delta);
+ }
+ else if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, " Dependencies are too"
+ " complicated for this optimization\n");
+ }
+ }
+ }
+ else if (rtx_equal_p (DF_REF_REG (use), index_reg))
+ {
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ fprintf (dump_file, "Found use corresponding to index\n");
+ df_ref_debug (use, dump_file);
+ }
+ struct df_link *def_link = DF_REF_CHAIN (use);
+ num_index_defs++;
+
+ /* If there is no definition, or the definition
+ is artificial, or there are multiple
+ definitions, this is an originating use. */
+ if (!def_link || !def_link->ref
+ || DF_REF_IS_ARTIFICIAL (def_link->ref) || def_link->next)
+ {
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "Use is originating!\n");
+
+ index_is_relevant = true;
+ index_is_originating = true;
+ insn_entry[uid].original_index_reg = REGNO (index_reg);
+ insn_entry[uid].original_index_use = uid;
+ insn_entry[uid].index_delta = 0;
+ insn_entry[uid].index_equivalence_hash
+ = equivalence_hash (def_link);
+ }
+ else
+ {
+ /* Only one definition. Dig deeper. */
+ long long int delta = 0;
+ rtx insn2 = find_true_originator (def_link, &delta);
+
+ if (insn2)
+ {
+ unsigned int uid2 = INSN_UID (insn2);
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "Use may propagate from %d\n",
+ uid2);
+
+ struct df_insn_info *insn_info2
+ = DF_INSN_INFO_GET (insn2);
+
+ df_ref use2;
+ if (insn_info2)
+ use2 = DF_INSN_INFO_USES (insn_info2);
+ else
+ use2 = NULL;
+
+ if (!use2 || !DF_REF_NEXT_LOC (use2))
+ {
+ index_is_originating = false;
+
+ rtx body = PATTERN (insn2);
+ gcc_assert (GET_CODE (body) == SET);
+ gcc_assert (REG_P (SET_DEST (body)));
+
+ insn_entry[uid].original_index_reg
+ = REGNO (SET_DEST(body));
+ insn_entry[uid].original_index_use = uid2;
+ insn_entry[uid].index_delta = delta;
+
+ if (use2)
+ {
+ struct df_link *def_link = DF_REF_CHAIN (use2);
+ index_is_relevant = true;
+ insn_entry[uid].index_equivalence_hash
+ = equivalence_hash (def_link);
+ }
+ /* else, index is not relevant. */
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, " propagates from originating "
+ "insn %d, with delta %lld\n",
+ uid2, delta);
+ }
+ else if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, " Dependencies are too complicated"
+ " for this optimization\n");
+ }
+ }
+ }
+ }
+
+ /* This insn is only relevant if there is exactly one defintion of
+ base and one definition of index and they are both considered to
+ be relevant. */
+ if ((num_base_defs == 1) && (num_index_defs == 1) &&
+ base_is_relevant && index_is_relevant)
+ {
+ insn_entry[uid].is_relevant = true;
+ insn_entry[uid].is_originating =
+ (base_is_originating && index_is_originating);
+ }
+ else if (dump_file)
+ {
+ fprintf (dump_file,
+ "Rejecting dform optimization of insn %d\n", uid);
+ if (num_base_defs != 1)
+ fprintf (dump_file, "Too %s (%d) base definitions\n",
+ (num_base_defs > 1)? "many": "few", num_base_defs);
+ if (num_index_defs != 1)
+ fprintf (dump_file, "Too %s (%d) index definitions\n",
+ (num_index_defs > 1)? "many": "few", num_index_defs);
+ if (!base_is_relevant)
+ fprintf (dump_file,
+ "The available base definition is not relevant\n");
+ if (!index_is_relevant)
+ fprintf (dump_file,
+ "The available index definition is not relevant\n");
+ }
+ }
+ else if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, " punting because base or index not registers\n");
+}
+
+
+/* Main entry point for this pass. */
+unsigned int
+rs6000_insert_dform (function *fun)
+{
+ basic_block bb;
+ rtx_insn *insn, *curr_insn = 0;
+ indexing_web_entry *insn_entry;
+
+ calculate_dominance_info (CDI_DOMINATORS);
+
+ /* Dataflow analysis for use-def chains. */
+ df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
+ df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
+ df_analyze ();
+
+ /* Since this pass creates new instructions, get_max_uid () may
+ return different values at different times during this pass. The
+ insn_entry array does not represent any of the instructions newly
+ inserted during this pass. */
+ max_uid_at_start = get_max_uid ();
+ insn_entry = XCNEWVEC (indexing_web_entry, max_uid_at_start);
+
+ if (dump_file) {
+ fprintf (dump_file, "Creating insn_entry array with %d entries\n",
+ max_uid_at_start);
+ }
+
+ /* The general approach is to:
+
+ 1. Look for multiple array indexing expressions that refer to
+ the same array base address such as are represented by the
+ rtl excerpts below..
+
+ 2. Group these into subsets for which the indexing expression
+ derives from the same initial_value + some accumulation of
+ constant values added thereto.
+
+ (cinsn 2 (set (reg/v/f:DI <27> [ x ])
+ (reg:DI 3 [ x ])) "ddot-c.c":12
+ (expr_list:REG_DEAD (reg:DI 3 [ x ])))
+
+ ...
+
+ (cinsn 31 (set (reg:V2DF <35> [ vect__3.7 ])
+ (mem:V2DF (plus:DI (reg/v/f:DI <27> [ x ])
+ (reg:DI <9> [ ivtmp.18 ]))
+ [1 MEM[base: x_20(D), index: ivtmp.18_35,
+ offset: 0B]+0 S16 A64])) "ddot-c.c":18)
+
+ ...
+
+ (cinsn 304 (set (reg:DI <70>)
+ (plus:DI (reg:DI <9> [ ivtmp.18 ])
+ (const_int 16)))
+ (expr_list:REG_DEAD (reg:DI <9> [ ivtmp.18 ])))
+ (cinsn 34 (set (reg:DI <9> [ ivtmp.18 ])
+ (reg:DI <70>)))
+
+ */
+
+ /* Walk the insns to gather basic data. */
+ FOR_ALL_BB_FN (bb, fun)
+ {
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "Scrutinizing bb %d\n", bb->index);
+
+ FOR_BB_INSNS_SAFE (bb, insn, curr_insn)
+ {
+ unsigned int uid = INSN_UID (insn);
+
+ insn_entry[uid].insn = insn;
+ insn_entry[uid].bb = BLOCK_FOR_INSN (insn);
+ insn_entry[uid].is_relevant = false;
+
+ if (dump_file && (dump_flags & TDF_DETAILS)) {
+ fprintf (dump_file, "\nLooking at insn: %d\n", uid);
+ df_dump_insn_top (insn, dump_file);
+ dump_insn_slim (dump_file, insn);
+ df_dump_insn_bottom (insn, dump_file);
+ }
+
+ /*
+ * First, look for all memory[base + index] expressions.
+ * Then group these by base.
+ * Then for all instructions in each group, scrutinize the index
+ * definition. Partition this group according to the origin
+ * variable upon which the the definitions of i are based.
+ *
+ * How do we define "origin variable"?
+ *
+ * If i has multiple definitions, it is its own origin
+ * variable. Likewise, if i has a single definition and the
+ * definition is NOT the sum or difference of a constant value
+ * and some other variable, then i is its own origin variable.
+ *
+ * Otherwise, i has the same origin variable as the expression
+ * that represents its definition.
+ *
+ * After we've created these partitions, for each partition
+ * whose size is greater than 1:
+ *
+ * 1. introduce derived_ptr = base + origin_variable
+ * immediately following the instruction that defines
+ * origin_variable.
+ *
+ * 2. for each member of the partition, replace the expression
+ * memory [base + index] with derived_ptr [constant], where
+ * constant is the sum of all constant values added to the
+ * origin variable to represent this particular value of i.
+ */
+ if (NONDEBUG_INSN_P (insn))
+ {
+ rtx body = PATTERN (insn);
+ rtx mem;
+ if ((GET_CODE (body) == SET) && MEM_P (SET_SRC (body)))
+ {
+ mem = XEXP (SET_SRC (body), 0);
+ insn_entry[uid].is_load = true;
+ insn_entry[uid].is_store = false;
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ fprintf (dump_file,
+ " this insn is fetching data from memory: ");
+ print_inline_rtx (dump_file, mem, 2);
+ fprintf (dump_file, "\n");
+ }
+ }
+ else if ((GET_CODE (body) == SET) && MEM_P (SET_DEST (body)))
+ {
+ mem = XEXP (SET_DEST (body), 0);
+ insn_entry[uid].is_load = false;
+ insn_entry[uid].is_store = true;
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ fprintf (dump_file,
+ " this insn is storing data to memory: ");
+ print_inline_rtx (dump_file, mem, 2);
+ fprintf (dump_file, "\n");
+ }
+ }
+ else
+ {
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file,
+ " this insn is neither load nor store\n");
+ continue; /* Not a load or store */
+ }
+
+ enum rtx_code code = GET_CODE (mem);
+ if ((code == PLUS) || (code == MINUS))
+ attempt_transform (mem, insn, insn_entry);
+ else if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file,
+ " address not sum or difference of values\n");
+ }
+ /* else, this is a DEBUG_INSN_P (insn) so ignore it. */
+ }
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "\n");
+ }
+
+ build_and_fixup_equivalence_classes (insn_entry);
+ free_dominance_info (CDI_DOMINATORS);
+ return 0;
+} // anon namespace
+
+
+const pass_data pass_data_insert_dform =
+{
+ RTL_PASS, /* type */
+ "dform", /* name */
+ OPTGROUP_NONE, /* optinfo_flags, or could use OPTGROUP_LOOP */
+ TV_NONE, /* tv_id, or could use TV_LOOP_UNROLL */
+ 0, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ TODO_df_finish, /* todo_flags_finish */
+};
+
+class pass_insert_dform : public rtl_opt_pass
+{
+public:
+ pass_insert_dform(gcc::context *ctxt)
+ : rtl_opt_pass(pass_data_insert_dform, ctxt)
+ {}
+
+ /* opt_pass methods: */
+ virtual bool gate (function *)
+ {
+ // This is most relevant to P9 targets since that architecture
+ // introduces new D-form instructions, but this may pay off on
+ // other architectures as well. Might want to experiment.
+ return (optimize > 0 && !BYTES_BIG_ENDIAN && TARGET_VSX
+ && TARGET_P9_VECTOR);
+ }
+
+ virtual unsigned int execute (function *fun)
+ {
+ return rs6000_insert_dform (fun);
+ }
+
+ opt_pass *clone ()
+ {
+ return new pass_insert_dform (m_ctxt);
+ }
+
+}; // class pass_insert_dform
+
+rtl_opt_pass *make_pass_insert_dform (gcc::context *ctxt)
+{
+ return new pass_insert_dform (ctxt);
+}
===================================================================
@@ -22,6 +22,8 @@ along with GCC; see the file COPYING3. If not see
INSERT_PASS_AFTER (PASS, INSTANCE, TGT_PASS)
INSERT_PASS_BEFORE (PASS, INSTANCE, TGT_PASS)
REPLACE_PASS (PASS, INSTANCE, TGT_PASS)
+ Be advised: gawk program does not parse C comments if inserted below.
*/
INSERT_PASS_BEFORE (pass_cse, 1, pass_analyze_swaps);
+ INSERT_PASS_AFTER (pass_loop2, 1, pass_insert_dform);
===================================================================
@@ -47,6 +47,8 @@ extern bool legitimate_indirect_address_p (rtx, in
extern bool legitimate_indexed_address_p (rtx, int);
extern bool avoiding_indexed_address_p (machine_mode);
extern rtx rs6000_force_indexed_or_indirect_mem (rtx x);
+extern bool rs6000_target_supports_dform_offset_p (machine_mode,
+ HOST_WIDE_INT);
extern rtx rs6000_got_register (rtx);
extern rtx find_addr_reg (rtx);
@@ -246,6 +248,8 @@ namespace gcc { class context; }
class rtl_opt_pass;
extern rtl_opt_pass *make_pass_analyze_swaps (gcc::context *);
+extern rtl_opt_pass *make_pass_insert_dform (gcc::context *);
+
extern bool rs6000_sum_of_two_registers_p (const_rtx expr);
extern bool rs6000_quadword_masked_address_p (const_rtx exp);
extern rtx rs6000_gen_lvx (enum machine_mode, rtx, rtx);
===================================================================
@@ -8725,6 +8725,150 @@ rs6000_debug_legitimate_address_p (machine_mode mo
return ret;
}
+/* This function provides an approximation of which d-form addressing
+ expressions are valid on any given target configuration. This
+ approximation guides optimization choices. Secondary validation
+ of the addressing mode is performed before code generation.
+
+ Return true iff target has instructions to perform a memory
+ operation at the specified BYTE_OFFSET from an address held
+ in a general purpose register. */
+bool
+rs6000_target_supports_dform_offset_p (machine_mode mode,
+ HOST_WIDE_INT byte_offset)
+{
+ const HOST_WIDE_INT max_16bit_signed = (0x7fffffff);
+ const HOST_WIDE_INT min_16bit_signed = -1 - max_16bit_signed;
+
+ /* Available d-form instructions with P1 (the original Power architecture):
+
+ lbz RT,D(RA) - load byte and zero d-form
+ lhz RT,D(RA) - load half word and zero d-form
+ lha RT,D(RA) - load half word algebraic d-form
+ lwz RT,D(RA) - load word and zero d-form
+ lfs FRT,D(RA) - load floating-point single d-form
+ lfd FRT,D(RA) - load floating-point double d-form
+
+ stb RS,D(RA) - store byte d-form
+ sth RS,D(RA) - store half word d-form
+ stfs FRS,D(RA) - store floating point single d-form
+ stfd FRS,D(RA) - store floating point double d-form */
+
+ /* Available d-form instructions with PPC (prior to v2.00):
+ (option mpowerpc "existed in the past" but is now "always on"
+
+ lwa RT,DS(RA) - load word algebraic ds-form (2 bottom bits zero)
+ ld RT,DS(RA) - load double word ds-form (2 bottom bits zero)
+
+ std RS,DS(RA) - store double word ds-form (2 bottom bits zero)
+
+ Consider lwa redundant with insn available in prior processors. */
+ switch (mode)
+ {
+ case E_QImode:
+ case E_HImode:
+ case E_SImode:
+ case E_SFmode:
+ case E_DFmode:
+ if (IN_RANGE (byte_offset, min_16bit_signed, max_16bit_signed))
+ return true;
+ break;
+
+ case E_DImode:
+ if (IN_RANGE (byte_offset, min_16bit_signed, max_16bit_signed)
+ && ((byte_offset & 0x03) == 0))
+ return true;
+ break;
+
+ default:
+ ; /* Fall through to see if other instructions will work. */
+
+ }
+
+ /* Available d-form instructionw with v2.03:
+
+ lq RTp,DQ(RA) - load quadword dq-form (4 bottom bits zero)
+
+ stq RSp,DS(RA) - store quadword ds-form (2 bottom bits zero)
+
+ These instructions are not recommended for general use as they
+ are expected to be very inefficient. Their design was apparently
+ motivated by a need to support atomic quad-word access, which is
+ difficult to implement even in hardware on some architectures.
+ Furthermore, the design of these instructions apparently does the
+ "wrong" thing with regards to swapping of double words on load
+ and store for little-endian targets.
+
+ Therefore, this routine assumes v2.03 does NOT support quadword
+ d-form addressing. */
+
+ /* Available d-form instructions with v2.05
+
+ (There are some floating-point load and store double-pair
+ instructions. Consider them "not available". There are
+ described as phasing out, which means they are expected
+ to have poor performance.) */
+
+ /* Available d-form instructions with 3.0
+
+ lxsd VRT,DS(RA) - Load VSX scalar double word ds-form (2 bottom bits zero)
+ (redundant with lfd from P1)
+ lxssp VRT,DS(RA) - Load VSX scalar single precision ds-form
+ (bottom 2 bits zero)
+ (redundant with lfs from P1)
+ lxv XT,DQ(RA) - Load VSX Vector dq-form (4 bottom bits zero)
+ (Works on little endian for any element type, but
+ does not preserve lanes.)
+
+ stxsd VRS,DS(RA) - Store VSX scalar double-word DS form
+ (bottom 2 bits zero)
+ (redundant with stfd from P1)
+ stxssp VRS,DS(RA) - Store VSX scalar single precision DS-form
+ (bottom 2 bits zero)
+ (redundant with stfs from P1)
+ stxv XS,DQ(RA) - Store VSX vector dq-form (4 bottom bits zero)
+ (Works on little endian for any element type,
+ but does not preserve lanes.)
+
+ lxv and stxv load/store to/from any VSX register, including
+ registers that overlay with floating point and altivec register
+ sets. */
+
+ if (rs6000_isa_flags & OPTION_MASK_MODULO) /* ISA 3.0 */
+ {
+ switch (mode)
+ {
+ case E_V16QImode:
+ case E_V8HImode:
+ case E_V4SFmode:
+ case E_V4SImode:
+ case E_V2DFmode:
+ case E_V2DImode:
+ case E_V1TImode:
+ case E_TImode:
+
+ case E_KFmode: /* ieee 754 128-bit floating point */
+ case E_IFmode: /* IBM extended 128-bit double */
+ case E_TFmode: /* 128-bit double (form depends on
+ gcc command line, which may be
+ either -mabi=ieeelongdouble (KF)
+ or -mabi=ibmlongdouble (IF). */
+ /* All 128-bit loads and stores are handled by lxv and stxv. */
+ if (IN_RANGE (byte_offset, min_16bit_signed, max_16bit_signed)
+ && ((byte_offset & 0x0f) == 0))
+ return true;
+ break;
+
+ default:
+ ; /* fall through to see if other instructions will work. */
+ }
+ }
+
+ /* Add modeling support for newly introduced instructions here. */
+
+ return false;
+}
+
/* Implement TARGET_MODE_DEPENDENT_ADDRESS_P. */
static bool
===================================================================
@@ -47,6 +47,10 @@ rs6000-call.o: $(srcdir)/config/rs6000/rs6000-call
$(COMPILE) $<
$(POSTCOMPILE)
+rs6000-p9dform.o: $(srcdir)/config/rs6000/rs6000-p9dform.c
+ $(COMPILE) $<
+ $(POSTCOMPILE)
+
$(srcdir)/config/rs6000/rs6000-tables.opt: $(srcdir)/config/rs6000/genopt.sh \
$(srcdir)/config/rs6000/rs6000-cpus.def
$(SHELL) $(srcdir)/config/rs6000/genopt.sh $(srcdir)/config/rs6000 > \
===================================================================
@@ -499,7 +499,7 @@ or1k*-*-*)
;;
powerpc*-*-*)
cpu_type=rs6000
- extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o rs6000-call.o"
+ extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-p9dform.o rs6000-logue.o rs6000-call.o"
extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h"
extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h"
===================================================================
@@ -0,0 +1,44 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" "-mdejagnu-cpu=*" } { "-mdejagnu-cpu=power9" } } */
+/* { dg-skip-if "" { powerpc*-*-aix* } } */
+/* { dg-options "-O3 -mdejagnu-cpu=power9 -mtune=power9 -funroll-loops" } */
+/* This test confirms that the dform instructions are selected in the
+ translation of this main program. */
+
+extern void first_dummy ();
+extern void dummy (double sacc, int n);
+extern void other_dummy ();
+
+extern float opt_value;
+extern char *opt_desc;
+
+#define M 128
+#define N 512
+
+double x [N];
+double y [N];
+
+int main (int argc, char *argv []) {
+ double sacc;
+
+ first_dummy ();
+ for (int j = 0; j < M; j++) {
+
+ sacc = 0.00;
+ for (unsigned long long int i = 0; i < N; i++) {
+ sacc += x[i] * y[i];
+ }
+ dummy (sacc, N);
+ }
+ opt_value = ((float) N) * 2 * ((float) M);
+ opt_desc = "flops";
+ other_dummy ();
+}
+
+/* At time the dform optimization pass was merged with trunk, 12
+ lxv instructions were emitted in place of the same number of lxvx
+ instructions. No need to require exactly this number, as it may
+ change when other optimization passes evolve. */
+
+/* { dg-final { scan-assembler {\mlxv\M} } } */
===================================================================
@@ -0,0 +1,58 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" "-mdejagnu-cpu=*" } { "-mdejagnu-cpu=power9" } } */
+/* { dg-skip-if "" { powerpc*-*-aix* } } */
+/* { dg-options "-O3 -mdejagnu-cpu=power9 -mtune=power9 -funroll-loops" } */
+
+
+/* This test confirms that the dform instructions are selected in the
+ translation of this main program. */
+
+extern void first_dummy ();
+extern void dummy (double sacc, int n);
+extern void other_dummy ();
+
+extern float opt_value;
+extern char *opt_desc;
+
+#define M 128
+#define N 512
+
+double x [N];
+double y [N];
+double z [N];
+
+int main (int argc, char *argv []) {
+ double sacc;
+
+ first_dummy ();
+ for (int j = 0; j < M; j++) {
+
+ sacc = 0.00;
+ for (unsigned long long int i = 0; i < N; i++) {
+ z[i] = x[i] * y[i];
+ sacc += z[i];
+ }
+ dummy (sacc, N);
+ }
+ opt_value = ((float) N) * 2 * ((float) M);
+ opt_desc = "flops";
+ other_dummy ();
+}
+
+
+
+/* At time the dform optimization pass was merged with trunk, 12
+ lxv instructions were emitted in place of the same number of lxvx
+ instructions. No need to require exactly this number, as it may
+ change when other optimization passes evolve. */
+
+/* { dg-final { scan-assembler {\mlxv\M} } } */
+
+/* At time the dform optimization pass was merged with trunk, 6
+ stxv instructions were emitted in place of the same number of stxvx
+ instructions. No need to require exactly this number, as it may
+ change when other optimization passes evolve. */
+
+/* { dg-final { scan-assembler {\mstxv\M} } } */
+
===================================================================
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" "-mdejagnu-cpu=*" } { "-mdejagnu-cpu=power9" } } */
+/* { dg-skip-if "" { powerpc*-*-aix* } } */
+/* { dg-options "-O3 -mdejagnu-cpu=power9 -funroll-loops -Wall -no-pie" } */
+
+#define TYPE signed int
+#include "p9-dform-generic.h"
+
+/* The precise number of lxv and stxv instructions may be impacted by
+ complex interactions between optimization passes, but we expect at
+ least one of each. */
+/* { dg-final { scan-assembler {\mlxv\M} } } */
+/* { dg-final { scan-assembler {\mstxv\M} } } */
===================================================================
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" "-mdejagnu-cpu=*" } { "-mdejagnu-cpu=power9" } } */
+/* { dg-skip-if "" { powerpc*-*-aix* } } */
+/* { dg-options "-O3 -mdejagnu-cpu=power9 -funroll-loops -Wall -no-pie" } */
+
+#define TYPE unsigned long long
+#include "p9-dform-generic.h"
+
+/* The precise number of lxv and stxv instructions may be impacted by
+ complex interactions between optimization passes, but we expect at
+ least one of each. */
+/* { dg-final { scan-assembler {\mld\M} } } */
+/* { dg-final { scan-assembler {\mstd\M} } } */
===================================================================
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" "-mdejagnu-cpu=*" } { "-mdejagnu-cpu=power9" } } */
+/* { dg-skip-if "" { powerpc*-*-aix* } } */
+/* { dg-options "-O3 -mdejagnu-cpu=power9 -funroll-loops -Wall -no-pie" } */
+
+#define TYPE signed long long
+#include "p9-dform-generic.h"
+
+/* The precise number of lxv and stxv instructions may be impacted by
+ complex interactions between optimization passes, but we expect at
+ least one of each. */
+/* { dg-final { scan-assembler {\mld\M} } } */
+/* { dg-final { scan-assembler {\mstd\M} } } */
===================================================================
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" "-mdejagnu-cpu=*" } { "-mdejagnu-cpu=power9" } } */
+/* { dg-skip-if "" { powerpc*-*-aix* } } */
+/* { dg-options "-O3 -mdejagnu-cpu=power9 -funroll-loops -Wall -no-pie" } */
+
+#define TYPE unsigned __int128
+#include "p9-dform-generic.h"
+
+/* The precise number of lxv and stxv instructions may be impacted by
+ complex interactions between optimization passes, but we expect at
+ least one of each. */
+/* { dg-final { scan-assembler {\mld\M} } } */
+/* { dg-final { scan-assembler {\mstd\M} } } */
===================================================================
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" "-mdejagnu-cpu=*" } { "-mdejagnu-cpu=power9" } } */
+/* { dg-skip-if "" { powerpc*-*-aix* } } */
+/* { dg-options "-O3 -mdejagnu-cpu=power9 -funroll-loops -Wall -no-pie" } */
+
+#define TYPE signed __int128
+#include "p9-dform-generic.h"
+
+/* The precise number of lxv and stxv instructions may be impacted by
+ complex interactions between optimization passes, but we expect at
+ least one of each. */
+/* { dg-final { scan-assembler {\mld\M} } } */
+/* { dg-final { scan-assembler {\mstd\M} } } */
===================================================================
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" "-mdejagnu-cpu=*" } { "-mdejagnu-cpu=power9" } } */
+/* { dg-skip-if "" { powerpc*-*-aix* } } */
+/* { dg-options "-O3 -mdejagnu-cpu=power9 -funroll-loops -Wall -no-pie -mfloat128" } */
+
+#define TYPE __float128
+#include "p9-dform-generic.h"
+
+/* The precise number of lxv and stxv instructions may be impacted by
+ complex interactions between optimization passes, but we expect at
+ least one of each. */
+/* { dg-final { scan-assembler {\mlxv\M} } } */
+/* { dg-final { scan-assembler {\mstxv\M} } } */
===================================================================
@@ -0,0 +1,15 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" "-mdejagnu-cpu=*" } { "-mdejagnu-cpu=power9" } } */
+/* { dg-skip-if "" { powerpc*-*-aix* } } */
+/* { dg-options "-O3 -mdejagnu-cpu=power9 -funroll-loops -Wall -no-pie" } */
+
+
+#define TYPE float
+#include "p9-dform-generic.h"
+
+/* The precise number of lxv and stxv instructions may be impacted by
+ complex interactions between optimization passes, but we expect at
+ least one of each. */
+/* { dg-final { scan-assembler {\mlxv\M} } } */
+/* { dg-final { scan-assembler {\mstxv\M} } } */
===================================================================
@@ -0,0 +1,17 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" "-mdejagnu-cpu=*" } { "-mdejagnu-cpu=power9" } } */
+/* { dg-skip-if "" { powerpc*-*-aix* } } */
+/* { dg-options "-O3 -mdejagnu-cpu=power9 -funroll-loops -Wall -no-pie" } */
+
+#define TYPE double
+#include "p9-dform-generic.h"
+
+/* At time the dform optimization pass was merged with trunk, 6
+ lxv instructions were emitted in place of the same number of lxvx
+ instructions and 8 stxv instructions replace the same number of
+ stxvx instructions. No need to require exactly this number, as it
+ may change when other optimization passes evolve. */
+
+/* { dg-final { scan-assembler {\mlxv\M} } } */
+/* { dg-final { scan-assembler {\mstxv\M} } } */
===================================================================
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" "-mdejagnu-cpu=*" } { "-mdejagnu-cpu=power9" } } */
+/* { dg-skip-if "" { powerpc*-*-aix* } } */
+/* { dg-options "-O3 -mdejagnu-cpu=power9 -funroll-loops -Wall -no-pie" } */
+
+#define TYPE long double
+#include "p9-dform-generic.h"
+
+/* The precise number of lxv and stxv instructions may be impacted by
+ complex interactions between optimization passes, but we expect at
+ least one of each. */
+/* { dg-final { scan-assembler {\mlfd\M} } } */
+/* { dg-final { scan-assembler {\mstfd\M} } } */
===================================================================
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" "-mdejagnu-cpu=*" } { "-mdejagnu-cpu=power9" } } */
+/* { dg-skip-if "" { powerpc*-*-aix* } } */
+/* { dg-options "-O3 -mdejagnu-cpu=power9 -funroll-loops -Wall -no-pie" } */
+
+#define TYPE unsigned char
+#include "p9-dform-generic.h"
+
+/* The precise number of lxv and stxv instructions may be impacted by
+ complex interactions between optimization passes, but we expect at
+ least one of each. */
+/* { dg-final { scan-assembler {\mlxv\M} } } */
+/* { dg-final { scan-assembler {\mstxv\M} } } */
===================================================================
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" "-mdejagnu-cpu=*" } { "-mdejagnu-cpu=power9" } } */
+/* { dg-skip-if "" { powerpc*-*-aix* } } */
+/* { dg-options "-O3 -mdejagnu-cpu=power9 -funroll-loops -Wall -no-pie" } */
+
+#define TYPE signed char
+#include "p9-dform-generic.h"
+
+/* The precise number of lxv and stxv instructions may be impacted by
+ complex interactions between optimization passes, but we expect at
+ least one of each. */
+/* { dg-final { scan-assembler {\mlxv\M} } } */
+/* { dg-final { scan-assembler {\mstxv\M} } } */
===================================================================
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" "-mdejagnu-cpu=*" } { "-mdejagnu-cpu=power9" } } */
+/* { dg-skip-if "" { powerpc*-*-aix* } } */
+/* { dg-options "-O3 -mdejagnu-cpu=power9 -funroll-loops -Wall -no-pie" } */
+
+#define TYPE unsigned short
+#include "p9-dform-generic.h"
+
+/* The precise number of lxv and stxv instructions may be impacted by
+ complex interactions between optimization passes, but we expect at
+ least one of each. */
+/* { dg-final { scan-assembler {\mlxv\M} } } */
+/* { dg-final { scan-assembler {\mstxv\M} } } */
===================================================================
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" "-mdejagnu-cpu=*" } { "-mdejagnu-cpu=power9" } } */
+/* { dg-skip-if "" { powerpc*-*-aix* } } */
+/* { dg-options "-O3 -mdejagnu-cpu=power9 -funroll-loops -Wall -no-pie" } */
+
+#define TYPE signed short
+#include "p9-dform-generic.h"
+
+/* The precise number of lxv and stxv instructions may be impacted by
+ complex interactions between optimization passes, but we expect at
+ least one of each. */
+/* { dg-final { scan-assembler {\mlxv\M} } } */
+/* { dg-final { scan-assembler {\mstxv\M} } } */
===================================================================
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" "-mdejagnu-cpu=*" } { "-mdejagnu-cpu=power9" } } */
+/* { dg-skip-if "" { powerpc*-*-aix* } } */
+/* { dg-options "-O3 -mdejagnu-cpu=power9 -funroll-loops -Wall -no-pie" } */
+
+#define TYPE unsigned int
+#include "p9-dform-generic.h"
+
+/* The precise number of lxv and stxv instructions may be impacted by
+ complex interactions between optimization passes, but we expect at
+ least one of each. */
+/* { dg-final { scan-assembler {\mlxv\M} } } */
+/* { dg-final { scan-assembler {\mstxv\M} } } */
===================================================================
@@ -0,0 +1,34 @@
+
+#define ITERATIONS 1000000
+
+#define SIZE (16384/sizeof(TYPE))
+
+static TYPE x[SIZE] __attribute__ ((aligned (16)));
+static TYPE y[SIZE] __attribute__ ((aligned (16)));
+static TYPE a;
+
+void obfuscate(void *a, ...);
+
+static void __attribute__((noinline)) do_one(void)
+{
+ unsigned long i;
+
+ obfuscate(x, y, &a);
+
+ for (i = 0; i < SIZE; i++)
+ y[i] = a * x[i];
+
+ obfuscate(x, y, &a);
+
+}
+
+int main(void)
+{
+ unsigned long i;
+
+ for (i = 0; i < ITERATIONS; i++)
+ do_one();
+
+ return 0;
+
+}