[rs6000] reorganize p8 swap optimization code into a separate file.

Message ID 1499372649.6283.4.camel@linux.vnet.ibm.com
State New

Commit Message

Aaron Sawdey July 6, 2017, 8:24 p.m. UTC
This patch is simple now that the toc_relative_expr_p patch I put in
last week removed the need to access the tocrel_base/tocrel_offset
static variables. It moves all the machinery used to optimize p8
little-endian vector code by removing unneeded doubleword swaps into a
new file, rs6000-p8swap.c. This code is only needed for p8, so
separating it keeps it out of the way in the future.

Bootstrap/regtest passed; patch pre-approved by Segher and committed
as r250040.

2017-07-06  Aaron Sawdey  <acsawdey@linux.vnet.ibm.com>

	* config/rs6000/rs6000.c (union_defs, union_uses, insn_is_load_p,
	insn_is_store_p, insn_is_swap_p, const_load_sequence_p, v2df_reduction_p,
	rtx_is_swappable_p, insn_is_swappable_p, chain_contains_only_swaps,
	mark_swaps_for_removal, swap_const_vector_halves, adjust_subreg_index,
	permute_load, permute_store, adjust_extract, adjust_splat, 
	adjust_xxpermdi, adjust_concat, adjust_vperm, handle_special_swappables,
	replace_swap_with_copy, dump_swap_insn_table, 
	alignment_with_canonical_addr, alignment_mask, find_alignment_op,
	recombine_lvx_pattern, recombine_stvx_pattern, 
	recombine_lvx_stvx_patterns, rs6000_analyze_swaps, 
	make_pass_analyze_swaps): Move all code related to p8 swap optimizations
	to file rs6000-p8swap.c.
	* config/rs6000/rs6000-p8swap.c: New file.
	* config/rs6000/t-rs6000: Add rule to build rs6000-p8swap.o.
	* config.gcc: Add rs6000-p8swap.o to extra_objs for powerpc*-*-*
	and rs6000*-*-* targets.
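
The new t-rs6000 rule itself is not visible in the excerpt below; as a
sketch only (the exact recipe in the committed patch may differ), it
presumably mirrors the existing per-object rules in that fragment:

	rs6000-p8swap.o: $(srcdir)/config/rs6000/rs6000-p8swap.c
		$(COMPILE) $<
		$(POSTCOMPILE)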

Patch

Index: gcc/config/rs6000/rs6000-p8swap.c
===================================================================
--- gcc/config/rs6000/rs6000-p8swap.c	(revision 0)
+++ gcc/config/rs6000/rs6000-p8swap.c	(working copy)
@@ -0,0 +1,1892 @@ 
+/* Subroutines used to remove unnecessary doubleword swaps
+   for p8 little-endian VSX code.
+   Copyright (C) 1991-2017 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "tree.h"
+#include "memmodel.h"
+#include "df.h"
+#include "tm_p.h"
+#include "ira.h"
+#include "print-tree.h"
+#include "varasm.h"
+#include "explow.h"
+#include "expr.h"
+#include "output.h"
+#include "tree-pass.h"
+
+/* Analyze vector computations and remove unnecessary doubleword
+   swaps (xxswapdi instructions).  This pass is performed only
+   for little-endian VSX code generation.
+
+   For this specific case, loads and stores of 4x32 and 2x64 vectors
+   are inefficient.  These are implemented using the lxvd2x and
+   stxvd2x instructions, which invert the order of doublewords in
+   a vector register.  Thus the code generation inserts an xxswapdi
+   after each such load, and prior to each such store.  (For spill
+   code after register assignment, an additional xxswapdi is inserted
+   following each store in order to return a hard register to its
+   unpermuted value.)
+
+   The extra xxswapdi instructions reduce performance.  This can be
+   particularly bad for vectorized code.  The purpose of this pass
+   is to reduce the number of xxswapdi instructions required for
+   correctness.
+
+   The primary insight is that much code that operates on vectors
+   does not care about the relative order of elements in a register,
+   so long as the correct memory order is preserved.  If we have
+   a computation where all input values are provided by lxvd2x/xxswapdi
+   sequences, all outputs are stored using xxswapdi/stxvd2x sequences,
+   and all intermediate computations are pure SIMD (independent of
+   element order), then all the xxswapdi's associated with the loads
+   and stores may be removed.
+
+   This pass uses some of the infrastructure and logical ideas from
+   the "web" pass in web.c.  We create maximal webs of computations
+   fitting the description above using union-find.  Each such web is
+   then optimized by removing its unnecessary xxswapdi instructions.
+
+   The pass is placed prior to global optimization so that we can
+   perform the optimization in the safest and simplest way possible;
+   that is, by replacing each xxswapdi insn with a register copy insn.
+   Subsequent forward propagation will remove copies where possible.
+
+   There are some operations sensitive to element order for which we
+   can still allow the operation, provided we modify those operations.
+   These include CONST_VECTORs, for which we must swap the first and
+   second halves of the constant vector; and SUBREGs, for which we
+   must adjust the byte offset to account for the swapped doublewords.
+   A remaining opportunity would be non-immediate-form splats, for
+   which we should adjust the selected lane of the input.  We should
+   also make code generation adjustments for sum-across operations,
+   since this is a common vectorizer reduction.
+
+   Because we run prior to the first split, we can see loads and stores
+   here that match *vsx_le_perm_{load,store}_<mode>.  These are vanilla
+   vector loads and stores that have not yet been split into a permuting
+   load/store and a swap.  (One way this can happen is with a builtin
+   call to vec_vsx_{ld,st}.)  We can handle these as well, but rather
+   than deleting a swap, we convert the load/store into a permuting
+   load/store (which effectively removes the swap).  */
+
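+/* As a rough illustration (register numbers are arbitrary and the
+   exact code depends on context), the sequences this pass targets
+   look like:
+
+     lxvd2x 0,0,3     # permuting load; doublewords arrive reversed
+     xxswapd 0,0      # swap inserted to restore element order
+     ...              # order-insensitive SIMD computation
+     xxswapd 0,0      # swap inserted ahead of the store
+     stxvd2x 0,0,4    # permuting store; doublewords reversed again
+
+   When every input and output of the computation is bracketed by
+   such sequences, both xxswapd instructions can be removed.  */
+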
+/* Notes on Permutes
+
+   We do not currently handle computations that contain permutes.  There
+   is a general transformation that can be performed correctly, but it
+   may introduce more expensive code than it replaces.  To handle these
+   would require a cost model to determine when to perform the optimization.
+   This commentary records how this could be done if desired.
+
+   The most general permute is something like this (example for V16QI):
+
+   (vec_select:V16QI (vec_concat:V32QI (op1:V16QI) (op2:V16QI))
+                     (parallel [(const_int a0) (const_int a1)
+                                 ...
+                                (const_int a14) (const_int a15)]))
+
+   where a0,...,a15 are in [0,31] and select elements from op1 and op2
+   to produce the result.
+
+   Regardless of mode, we can convert the PARALLEL to a mask of 16
+   byte-element selectors.  Let's call this M, with M[i] representing
+   the ith byte-element selector value.  Then if we swap doublewords
+   throughout the computation, we can get correct behavior by replacing
+   M with M' as follows:
+
+    M'[i] = { (M[i]+8)%16      : M[i] in [0,15]
+            { ((M[i]+8)%16)+16 : M[i] in [16,31]
+
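+   For instance (values chosen only to ground the formula), M[i] = 3
+   gives M'[i] = (3+8)%16 = 11, while M[i] = 20 gives
+   M'[i] = ((20+8)%16)+16 = 28.
+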
+   This seems promising at first, since we are just replacing one mask
+   with another.  But certain masks are preferable to others.  If M
+   is a mask that matches a vmrghh pattern, for example, M' certainly
+   will not.  Instead of a single vmrghh, we would generate a load of
+   M' and a vperm.  So we would need to know how many xxswapd's we can
+   remove as a result of this transformation to determine if it's
+   profitable; and preferably the logic would need to be aware of all
+   the special preferable masks.
+
+   Another form of permute is an UNSPEC_VPERM, in which the mask is
+   already in a register.  In some cases, this mask may be a constant
+   that we can discover with ud-chains, in which case the above
+   transformation is ok.  However, the common usage here is for the
+   mask to be produced by an UNSPEC_LVSL, in which case the mask 
+   cannot be known at compile time.  In such a case we would have to
+   generate several instructions to compute M' as above at run time,
+   and a cost model is needed again.
+
+   However, when the mask M for an UNSPEC_VPERM is loaded from the
+   constant pool, we can replace M with M' as above at no cost
+   beyond adding a constant pool entry.  */
+
+/* This is based on the union-find logic in web.c.  web_entry_base is
+   defined in df.h.  */
+class swap_web_entry : public web_entry_base
+{
+ public:
+  /* Pointer to the insn.  */
+  rtx_insn *insn;
+  /* Set if insn contains a mention of a vector register.  All other
+     fields are undefined if this field is unset.  */
+  unsigned int is_relevant : 1;
+  /* Set if insn is a load.  */
+  unsigned int is_load : 1;
+  /* Set if insn is a store.  */
+  unsigned int is_store : 1;
+  /* Set if insn is a doubleword swap.  This can either be a register swap
+     or a permuting load or store (test is_load and is_store for this).  */
+  unsigned int is_swap : 1;
+  /* Set if the insn has a live-in use of a parameter register.  */
+  unsigned int is_live_in : 1;
+  /* Set if the insn has a live-out def of a return register.  */
+  unsigned int is_live_out : 1;
+  /* Set if the insn contains a subreg reference of a vector register.  */
+  unsigned int contains_subreg : 1;
+  /* Set if the insn contains a 128-bit integer operand.  */
+  unsigned int is_128_int : 1;
+  /* Set if this is a call-insn.  */
+  unsigned int is_call : 1;
+  /* Set if this insn does not perform a vector operation for which
+     element order matters, or if we know how to fix it up if it does.
+     Undefined if is_swap is set.  */
+  unsigned int is_swappable : 1;
+  /* A nonzero value indicates what kind of special handling for this
+     insn is required if doublewords are swapped.  Undefined if
+     is_swappable is not set.  */
+  unsigned int special_handling : 4;
+  /* Set if the web represented by this entry cannot be optimized.  */
+  unsigned int web_not_optimizable : 1;
+  /* Set if this insn should be deleted.  */
+  unsigned int will_delete : 1;
+};
+
+enum special_handling_values {
+  SH_NONE = 0,
+  SH_CONST_VECTOR,
+  SH_SUBREG,
+  SH_NOSWAP_LD,
+  SH_NOSWAP_ST,
+  SH_EXTRACT,
+  SH_SPLAT,
+  SH_XXPERMDI,
+  SH_CONCAT,
+  SH_VPERM
+};
+
+/* Union INSN with all insns containing definitions that reach USE.
+   Detect whether USE is live-in to the current function.  */
+static void
+union_defs (swap_web_entry *insn_entry, rtx insn, df_ref use)
+{
+  struct df_link *link = DF_REF_CHAIN (use);
+
+  if (!link)
+    insn_entry[INSN_UID (insn)].is_live_in = 1;
+
+  while (link)
+    {
+      if (DF_REF_IS_ARTIFICIAL (link->ref))
+	insn_entry[INSN_UID (insn)].is_live_in = 1;
+
+      if (DF_REF_INSN_INFO (link->ref))
+	{
+	  rtx def_insn = DF_REF_INSN (link->ref);
+	  (void)unionfind_union (insn_entry + INSN_UID (insn),
+				 insn_entry + INSN_UID (def_insn));
+	}
+
+      link = link->next;
+    }
+}
+
+/* Union INSN with all insns containing uses reached from DEF.
+   Detect whether DEF is live-out from the current function.  */
+static void
+union_uses (swap_web_entry *insn_entry, rtx insn, df_ref def)
+{
+  struct df_link *link = DF_REF_CHAIN (def);
+
+  if (!link)
+    insn_entry[INSN_UID (insn)].is_live_out = 1;
+
+  while (link)
+    {
+      /* This could be an eh use or some other artificial use;
+	 we treat these all the same (killing the optimization).  */
+      if (DF_REF_IS_ARTIFICIAL (link->ref))
+	insn_entry[INSN_UID (insn)].is_live_out = 1;
+
+      if (DF_REF_INSN_INFO (link->ref))
+	{
+	  rtx use_insn = DF_REF_INSN (link->ref);
+	  (void)unionfind_union (insn_entry + INSN_UID (insn),
+				 insn_entry + INSN_UID (use_insn));
+	}
+
+      link = link->next;
+    }
+}
+
+/* Return 1 iff INSN is a load insn, including permuting loads that
+   represent an lxvd2x instruction; else return 0.  */
+static unsigned int
+insn_is_load_p (rtx insn)
+{
+  rtx body = PATTERN (insn);
+
+  if (GET_CODE (body) == SET)
+    {
+      if (GET_CODE (SET_SRC (body)) == MEM)
+	return 1;
+
+      if (GET_CODE (SET_SRC (body)) == VEC_SELECT
+	  && GET_CODE (XEXP (SET_SRC (body), 0)) == MEM)
+	return 1;
+
+      return 0;
+    }
+
+  if (GET_CODE (body) != PARALLEL)
+    return 0;
+
+  rtx set = XVECEXP (body, 0, 0);
+
+  if (GET_CODE (set) == SET && GET_CODE (SET_SRC (set)) == MEM)
+    return 1;
+
+  return 0;
+}
+
+/* Return 1 iff INSN is a store insn, including permuting stores that
+   represent an stxvd2x instruction; else return 0.  */
+static unsigned int
+insn_is_store_p (rtx insn)
+{
+  rtx body = PATTERN (insn);
+  if (GET_CODE (body) == SET && GET_CODE (SET_DEST (body)) == MEM)
+    return 1;
+  if (GET_CODE (body) != PARALLEL)
+    return 0;
+  rtx set = XVECEXP (body, 0, 0);
+  if (GET_CODE (set) == SET && GET_CODE (SET_DEST (set)) == MEM)
+    return 1;
+  return 0;
+}
+
+/* Return 1 iff INSN swaps doublewords.  This may be a reg-reg swap,
+   a permuting load, or a permuting store.  */
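+/* For example, a V2DI register swap has the form
+
+     (vec_select:V2DI (reg:V2DI x)
+                      (parallel [(const_int 1) (const_int 0)]))
+
+   and the check below generalizes this to 2, 4, 8, and 16 element
+   vectors.  */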
+static unsigned int
+insn_is_swap_p (rtx insn)
+{
+  rtx body = PATTERN (insn);
+  if (GET_CODE (body) != SET)
+    return 0;
+  rtx rhs = SET_SRC (body);
+  if (GET_CODE (rhs) != VEC_SELECT)
+    return 0;
+  rtx parallel = XEXP (rhs, 1);
+  if (GET_CODE (parallel) != PARALLEL)
+    return 0;
+  unsigned int len = XVECLEN (parallel, 0);
+  if (len != 2 && len != 4 && len != 8 && len != 16)
+    return 0;
+  for (unsigned int i = 0; i < len / 2; ++i)
+    {
+      rtx op = XVECEXP (parallel, 0, i);
+      if (GET_CODE (op) != CONST_INT || INTVAL (op) != len / 2 + i)
+	return 0;
+    }
+  for (unsigned int i = len / 2; i < len; ++i)
+    {
+      rtx op = XVECEXP (parallel, 0, i);
+      if (GET_CODE (op) != CONST_INT || INTVAL (op) != i - len / 2)
+	return 0;
+    }
+  return 1;
+}
+
+/* Return TRUE if INSN is a register swap fed by a load from the
+   constant pool.  */
+static bool
+const_load_sequence_p (swap_web_entry *insn_entry, rtx insn)
+{
+  unsigned uid = INSN_UID (insn);
+  if (!insn_entry[uid].is_swap || insn_entry[uid].is_load)
+    return false;
+
+  const_rtx tocrel_base;
+
+  /* Find the unique use in the swap and locate its def.  If the def
+     isn't unique, punt.  */
+  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+  df_ref use;
+  FOR_EACH_INSN_INFO_USE (use, insn_info)
+    {
+      struct df_link *def_link = DF_REF_CHAIN (use);
+      if (!def_link || def_link->next)
+	return false;
+
+      rtx def_insn = DF_REF_INSN (def_link->ref);
+      unsigned uid2 = INSN_UID (def_insn);
+      if (!insn_entry[uid2].is_load || !insn_entry[uid2].is_swap)
+	return false;
+
+      rtx body = PATTERN (def_insn);
+      if (GET_CODE (body) != SET
+	  || GET_CODE (SET_SRC (body)) != VEC_SELECT
+	  || GET_CODE (XEXP (SET_SRC (body), 0)) != MEM)
+	return false;
+
+      rtx mem = XEXP (SET_SRC (body), 0);
+      rtx base_reg = XEXP (mem, 0);
+
+      df_ref base_use;
+      insn_info = DF_INSN_INFO_GET (def_insn);
+      FOR_EACH_INSN_INFO_USE (base_use, insn_info)
+	{
+	  if (!rtx_equal_p (DF_REF_REG (base_use), base_reg))
+	    continue;
+
+	  struct df_link *base_def_link = DF_REF_CHAIN (base_use);
+	  if (!base_def_link || base_def_link->next)
+	    return false;
+
+	  rtx tocrel_insn = DF_REF_INSN (base_def_link->ref);
+	  rtx tocrel_body = PATTERN (tocrel_insn);
+	  rtx base, offset;
+	  if (GET_CODE (tocrel_body) != SET)
+	    return false;
+	  /* There is an extra level of indirection for small/large
+	     code models.  */
+	  rtx tocrel_expr = SET_SRC (tocrel_body);
+	  if (GET_CODE (tocrel_expr) == MEM)
+	    tocrel_expr = XEXP (tocrel_expr, 0);
+	  if (!toc_relative_expr_p (tocrel_expr, false, &tocrel_base, NULL))
+	    return false;
+	  split_const (XVECEXP (tocrel_base, 0, 0), &base, &offset);
+	  if (GET_CODE (base) != SYMBOL_REF || !CONSTANT_POOL_ADDRESS_P (base))
+	    return false;
+	}
+    }
+  return true;
+}
+
+/* Return TRUE iff OP matches a V2DF reduction pattern.  See the
+   definition of vsx_reduc_<VEC_reduc_name>_v2df in vsx.md.  */
+static bool
+v2df_reduction_p (rtx op)
+{
+  if (GET_MODE (op) != V2DFmode)
+    return false;
+  
+  enum rtx_code code = GET_CODE (op);
+  if (code != PLUS && code != SMIN && code != SMAX)
+    return false;
+
+  rtx concat = XEXP (op, 0);
+  if (GET_CODE (concat) != VEC_CONCAT)
+    return false;
+
+  rtx select0 = XEXP (concat, 0);
+  rtx select1 = XEXP (concat, 1);
+  if (GET_CODE (select0) != VEC_SELECT || GET_CODE (select1) != VEC_SELECT)
+    return false;
+
+  rtx reg0 = XEXP (select0, 0);
+  rtx reg1 = XEXP (select1, 0);
+  if (!rtx_equal_p (reg0, reg1) || !REG_P (reg0))
+    return false;
+
+  rtx parallel0 = XEXP (select0, 1);
+  rtx parallel1 = XEXP (select1, 1);
+  if (GET_CODE (parallel0) != PARALLEL || GET_CODE (parallel1) != PARALLEL)
+    return false;
+
+  if (!rtx_equal_p (XVECEXP (parallel0, 0, 0), const1_rtx)
+      || !rtx_equal_p (XVECEXP (parallel1, 0, 0), const0_rtx))
+    return false;
+
+  return true;
+}
+
+/* Return 1 iff OP is an operand that will not be affected by having
+   vector doublewords swapped in memory.  */
+static unsigned int
+rtx_is_swappable_p (rtx op, unsigned int *special)
+{
+  enum rtx_code code = GET_CODE (op);
+  int i, j;
+  rtx parallel;
+
+  switch (code)
+    {
+    case LABEL_REF:
+    case SYMBOL_REF:
+    case CLOBBER:
+    case REG:
+      return 1;
+
+    case VEC_CONCAT:
+    case ASM_INPUT:
+    case ASM_OPERANDS:
+      return 0;
+
+    case CONST_VECTOR:
+      {
+	*special = SH_CONST_VECTOR;
+	return 1;
+      }
+
+    case VEC_DUPLICATE:
+      /* Opportunity: If XEXP (op, 0) has the same mode as the result,
+	 and XEXP (op, 1) is a PARALLEL with a single QImode const int,
+	 it represents a vector splat for which we can do special
+	 handling.  */
+      if (GET_CODE (XEXP (op, 0)) == CONST_INT)
+	return 1;
+      else if (REG_P (XEXP (op, 0))
+	       && GET_MODE_INNER (GET_MODE (op)) == GET_MODE (XEXP (op, 0)))
+	/* This catches V2DF and V2DI splat, at a minimum.  */
+	return 1;
+      else if (GET_CODE (XEXP (op, 0)) == TRUNCATE
+	       && REG_P (XEXP (XEXP (op, 0), 0))
+	       && GET_MODE_INNER (GET_MODE (op)) == GET_MODE (XEXP (op, 0)))
+	/* This catches splat of a truncated value.  */
+	return 1;
+      else if (GET_CODE (XEXP (op, 0)) == VEC_SELECT)
+	/* If the duplicated item is from a select, defer to the select
+	   processing to see if we can change the lane for the splat.  */
+	return rtx_is_swappable_p (XEXP (op, 0), special);
+      else
+	return 0;
+
+    case VEC_SELECT:
+      /* A vec_extract operation is ok if we change the lane.  */
+      if (GET_CODE (XEXP (op, 0)) == REG
+	  && GET_MODE_INNER (GET_MODE (XEXP (op, 0))) == GET_MODE (op)
+	  && GET_CODE ((parallel = XEXP (op, 1))) == PARALLEL
+	  && XVECLEN (parallel, 0) == 1
+	  && GET_CODE (XVECEXP (parallel, 0, 0)) == CONST_INT)
+	{
+	  *special = SH_EXTRACT;
+	  return 1;
+	}
+      /* An XXPERMDI is ok if we adjust the lanes.  Note that if the
+	 XXPERMDI is a swap operation, it will be identified by
+	 insn_is_swap_p and therefore we won't get here.  */
+      else if (GET_CODE (XEXP (op, 0)) == VEC_CONCAT
+	       && (GET_MODE (XEXP (op, 0)) == V4DFmode
+		   || GET_MODE (XEXP (op, 0)) == V4DImode)
+	       && GET_CODE ((parallel = XEXP (op, 1))) == PARALLEL
+	       && XVECLEN (parallel, 0) == 2
+	       && GET_CODE (XVECEXP (parallel, 0, 0)) == CONST_INT
+	       && GET_CODE (XVECEXP (parallel, 0, 1)) == CONST_INT)
+	{
+	  *special = SH_XXPERMDI;
+	  return 1;
+	}
+      else if (v2df_reduction_p (op))
+	return 1;
+      else
+	return 0;
+
+    case UNSPEC:
+      {
+	/* Various operations are unsafe for this optimization, at least
+	   without significant additional work.  Permutes are obviously
+	   problematic, as both the permute control vector and the ordering
+	   of the target values are invalidated by doubleword swapping.
+	   Vector pack and unpack modify the number of vector lanes.
+	   Merge-high/low will not operate correctly on swapped operands.
+	   Vector shifts across element boundaries are clearly uncool,
+	   as are vector select and concatenate operations.  Vector
+	   sum-across instructions define one operand with a specific
+	   order-dependent element, so additional fixup code would be
+	   needed to make those work.  Vector set and non-immediate-form
+	   vector splat are element-order sensitive.  A few of these
+	   cases might be workable with special handling if required.
+	   Adding cost modeling would be appropriate in some cases.  */
+	int val = XINT (op, 1);
+	switch (val)
+	  {
+	  default:
+	    break;
+	  case UNSPEC_VMRGH_DIRECT:
+	  case UNSPEC_VMRGL_DIRECT:
+	  case UNSPEC_VPACK_SIGN_SIGN_SAT:
+	  case UNSPEC_VPACK_SIGN_UNS_SAT:
+	  case UNSPEC_VPACK_UNS_UNS_MOD:
+	  case UNSPEC_VPACK_UNS_UNS_MOD_DIRECT:
+	  case UNSPEC_VPACK_UNS_UNS_SAT:
+	  case UNSPEC_VPERM:
+	  case UNSPEC_VPERM_UNS:
+	  case UNSPEC_VPERMHI:
+	  case UNSPEC_VPERMSI:
+	  case UNSPEC_VPKPX:
+	  case UNSPEC_VSLDOI:
+	  case UNSPEC_VSLO:
+	  case UNSPEC_VSRO:
+	  case UNSPEC_VSUM2SWS:
+	  case UNSPEC_VSUM4S:
+	  case UNSPEC_VSUM4UBS:
+	  case UNSPEC_VSUMSWS:
+	  case UNSPEC_VSUMSWS_DIRECT:
+	  case UNSPEC_VSX_CONCAT:
+	  case UNSPEC_VSX_SET:
+	  case UNSPEC_VSX_SLDWI:
+	  case UNSPEC_VUNPACK_HI_SIGN:
+	  case UNSPEC_VUNPACK_HI_SIGN_DIRECT:
+	  case UNSPEC_VUNPACK_LO_SIGN:
+	  case UNSPEC_VUNPACK_LO_SIGN_DIRECT:
+	  case UNSPEC_VUPKHPX:
+	  case UNSPEC_VUPKHS_V4SF:
+	  case UNSPEC_VUPKHU_V4SF:
+	  case UNSPEC_VUPKLPX:
+	  case UNSPEC_VUPKLS_V4SF:
+	  case UNSPEC_VUPKLU_V4SF:
+	  case UNSPEC_VSX_CVDPSPN:
+	  case UNSPEC_VSX_CVSPDP:
+	  case UNSPEC_VSX_CVSPDPN:
+	  case UNSPEC_VSX_EXTRACT:
+	  case UNSPEC_VSX_VSLO:
+	  case UNSPEC_VSX_VEC_INIT:
+	    return 0;
+	  case UNSPEC_VSPLT_DIRECT:
+	  case UNSPEC_VSX_XXSPLTD:
+	    *special = SH_SPLAT;
+	    return 1;
+	  case UNSPEC_REDUC_PLUS:
+	  case UNSPEC_REDUC:
+	    return 1;
+	  }
+      }
+
+    default:
+      break;
+    }
+
+  const char *fmt = GET_RTX_FORMAT (code);
+  int ok = 1;
+
+  for (i = 0; i < GET_RTX_LENGTH (code); ++i)
+    if (fmt[i] == 'e' || fmt[i] == 'u')
+      {
+	unsigned int special_op = SH_NONE;
+	ok &= rtx_is_swappable_p (XEXP (op, i), &special_op);
+	if (special_op == SH_NONE)
+	  continue;
+	/* Ensure we never have two kinds of special handling
+	   for the same insn.  */
+	if (*special != SH_NONE && *special != special_op)
+	  return 0;
+	*special = special_op;
+      }
+    else if (fmt[i] == 'E')
+      for (j = 0; j < XVECLEN (op, i); ++j)
+	{
+	  unsigned int special_op = SH_NONE;
+	  ok &= rtx_is_swappable_p (XVECEXP (op, i, j), &special_op);
+	  if (special_op == SH_NONE)
+	    continue;
+	  /* Ensure we never have two kinds of special handling
+	     for the same insn.  */
+	  if (*special != SH_NONE && *special != special_op)
+	    return 0;
+	  *special = special_op;
+	}
+
+  return ok;
+}
+
+/* Return 1 iff INSN is an insn that will not be affected by
+   having vector doublewords swapped in memory (in which case
+   *SPECIAL is unchanged), or that can be modified to be correct
+   if vector doublewords are swapped in memory (in which case
+   *SPECIAL is changed to a value indicating how).  */
+static unsigned int
+insn_is_swappable_p (swap_web_entry *insn_entry, rtx insn,
+		     unsigned int *special)
+{
+  /* Calls are always bad.  */
+  if (GET_CODE (insn) == CALL_INSN)
+    return 0;
+
+  /* Loads and stores seen here are not permuting, but we can still
+     fix them up by converting them to permuting ones.  Exceptions:
+     UNSPEC_LVE, UNSPEC_LVX, and UNSPEC_STVX, which have a PARALLEL
+     body instead of a SET; and UNSPEC_STVE, which has an UNSPEC
+     for the SET source.  Also we must now make an exception for lvx
+     and stvx when they are not in the UNSPEC_LVX/STVX form (with the
+     explicit "& -16") since this leads to unrecognizable insns.  */
+  rtx body = PATTERN (insn);
+  int i = INSN_UID (insn);
+
+  if (insn_entry[i].is_load)
+    {
+      if (GET_CODE (body) == SET)
+	{
+	  rtx rhs = SET_SRC (body);
+	  /* Even without a swap, the RHS might be a vec_select for, say,
+	     a byte-reversing load.  */
+	  if (GET_CODE (rhs) != MEM)
+	    return 0;
+	  if (GET_CODE (XEXP (rhs, 0)) == AND)
+	    return 0;
+
+	  *special = SH_NOSWAP_LD;
+	  return 1;
+	}
+      else
+	return 0;
+    }
+
+  if (insn_entry[i].is_store)
+    {
+      if (GET_CODE (body) == SET
+	  && GET_CODE (SET_SRC (body)) != UNSPEC)
+	{
+	  rtx lhs = SET_DEST (body);
+	  /* Even without a swap, the LHS might be a vec_select for, say,
+	     a byte-reversing store.  */
+	  if (GET_CODE (lhs) != MEM)
+	    return 0;
+	  if (GET_CODE (XEXP (lhs, 0)) == AND)
+	    return 0;
+	  
+	  *special = SH_NOSWAP_ST;
+	  return 1;
+	}
+      else
+	return 0;
+    }
+
+  /* A convert to single precision can be left as is provided that
+     all of its uses are in xxspltw instructions that splat BE element
+     zero.  */
+  if (GET_CODE (body) == SET
+      && GET_CODE (SET_SRC (body)) == UNSPEC
+      && XINT (SET_SRC (body), 1) == UNSPEC_VSX_CVDPSPN)
+    {
+      df_ref def;
+      struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+
+      FOR_EACH_INSN_INFO_DEF (def, insn_info)
+	{
+	  struct df_link *link = DF_REF_CHAIN (def);
+	  if (!link)
+	    return 0;
+
+	  for (; link; link = link->next)
+	    {
+	      rtx use_insn = DF_REF_INSN (link->ref);
+	      rtx use_body = PATTERN (use_insn);
+	      if (GET_CODE (use_body) != SET
+		  || GET_CODE (SET_SRC (use_body)) != UNSPEC
+		  || XINT (SET_SRC (use_body), 1) != UNSPEC_VSX_XXSPLTW
+		  || XVECEXP (SET_SRC (use_body), 0, 1) != const0_rtx)
+		return 0;
+	    }
+	}
+
+      return 1;
+    }
+
+  /* A concatenation of two doublewords is ok if we reverse the
+     order of the inputs.  */
+  if (GET_CODE (body) == SET
+      && GET_CODE (SET_SRC (body)) == VEC_CONCAT
+      && (GET_MODE (SET_SRC (body)) == V2DFmode
+	  || GET_MODE (SET_SRC (body)) == V2DImode))
+    {
+      *special = SH_CONCAT;
+      return 1;
+    }
+
+  /* V2DF reductions are always swappable.  */
+  if (GET_CODE (body) == PARALLEL)
+    {
+      rtx expr = XVECEXP (body, 0, 0);
+      if (GET_CODE (expr) == SET
+	  && v2df_reduction_p (SET_SRC (expr)))
+	return 1;
+    }
+
+  /* An UNSPEC_VPERM is ok if the mask operand is loaded from the
+     constant pool.  */
+  if (GET_CODE (body) == SET
+      && GET_CODE (SET_SRC (body)) == UNSPEC
+      && XINT (SET_SRC (body), 1) == UNSPEC_VPERM
+      && XVECLEN (SET_SRC (body), 0) == 3
+      && GET_CODE (XVECEXP (SET_SRC (body), 0, 2)) == REG)
+    {
+      rtx mask_reg = XVECEXP (SET_SRC (body), 0, 2);
+      struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+      df_ref use;
+      FOR_EACH_INSN_INFO_USE (use, insn_info)
+	if (rtx_equal_p (DF_REF_REG (use), mask_reg))
+	  {
+	    struct df_link *def_link = DF_REF_CHAIN (use);
+	    /* Punt if multiple definitions for this reg.  */
+	    if (def_link && !def_link->next
+		&& const_load_sequence_p (insn_entry,
+					  DF_REF_INSN (def_link->ref)))
+	      {
+		*special = SH_VPERM;
+		return 1;
+	      }
+	  }
+    }
+
+  /* Otherwise check the operands for vector lane violations.  */
+  return rtx_is_swappable_p (body, special);
+}
+
+enum chain_purpose { FOR_LOADS, FOR_STORES };
+
+/* Return true if the UD or DU chain headed by LINK is non-empty,
+   and every entry on the chain references an insn that is a
+   register swap.  Furthermore, if PURPOSE is FOR_LOADS, each such
+   register swap must have only permuting loads as reaching defs.
+   If PURPOSE is FOR_STORES, each such register swap must have only
+   register swaps or permuting stores as reached uses.  */
+static bool
+chain_contains_only_swaps (swap_web_entry *insn_entry, struct df_link *link,
+			   enum chain_purpose purpose)
+{
+  if (!link)
+    return false;
+
+  for (; link; link = link->next)
+    {
+      if (!ALTIVEC_OR_VSX_VECTOR_MODE (GET_MODE (DF_REF_REG (link->ref))))
+	continue;
+
+      if (DF_REF_IS_ARTIFICIAL (link->ref))
+	return false;
+
+      rtx reached_insn = DF_REF_INSN (link->ref);
+      unsigned uid = INSN_UID (reached_insn);
+      struct df_insn_info *insn_info = DF_INSN_INFO_GET (reached_insn);
+
+      if (!insn_entry[uid].is_swap || insn_entry[uid].is_load
+	  || insn_entry[uid].is_store)
+	return false;
+
+      if (purpose == FOR_LOADS)
+	{
+	  df_ref use;
+	  FOR_EACH_INSN_INFO_USE (use, insn_info)
+	    {
+	      struct df_link *swap_link = DF_REF_CHAIN (use);
+
+	      while (swap_link)
+		{
+		  if (DF_REF_IS_ARTIFICIAL (swap_link->ref))
+		    return false;
+
+		  rtx swap_def_insn = DF_REF_INSN (swap_link->ref);
+		  unsigned uid2 = INSN_UID (swap_def_insn);
+
+		  /* Only permuting loads are allowed.  */
+		  if (!insn_entry[uid2].is_swap || !insn_entry[uid2].is_load)
+		    return false;
+
+		  swap_link = swap_link->next;
+		}
+	    }
+	}
+      else if (purpose == FOR_STORES)
+	{
+	  df_ref def;
+	  FOR_EACH_INSN_INFO_DEF (def, insn_info)
+	    {
+	      struct df_link *swap_link = DF_REF_CHAIN (def);
+
+	      while (swap_link)
+		{
+		  if (DF_REF_IS_ARTIFICIAL (swap_link->ref))
+		    return false;
+
+		  rtx swap_use_insn = DF_REF_INSN (swap_link->ref);
+		  unsigned uid2 = INSN_UID (swap_use_insn);
+
+		  /* Permuting stores or register swaps are allowed.  */
+		  if (!insn_entry[uid2].is_swap || insn_entry[uid2].is_load)
+		    return false;
+
+		  swap_link = swap_link->next;
+		}
+	    }
+	}
+    }
+
+  return true;
+}
+
+/* Mark the xxswapdi instructions associated with permuting loads and
+   stores for removal.  Note that we only flag them for deletion here,
+   as there is a possibility of a swap being reached from multiple
+   loads, etc.  */
+static void
+mark_swaps_for_removal (swap_web_entry *insn_entry, unsigned int i)
+{
+  rtx insn = insn_entry[i].insn;
+  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+
+  if (insn_entry[i].is_load)
+    {
+      df_ref def;
+      FOR_EACH_INSN_INFO_DEF (def, insn_info)
+	{
+	  struct df_link *link = DF_REF_CHAIN (def);
+
+	  /* We know by now that these are swaps, so we can delete
+	     them confidently.  */
+	  while (link)
+	    {
+	      rtx use_insn = DF_REF_INSN (link->ref);
+	      insn_entry[INSN_UID (use_insn)].will_delete = 1;
+	      link = link->next;
+	    }
+	}
+    }
+  else if (insn_entry[i].is_store)
+    {
+      df_ref use;
+      FOR_EACH_INSN_INFO_USE (use, insn_info)
+	{
+	  /* Ignore uses for addressability.  */
+	  machine_mode mode = GET_MODE (DF_REF_REG (use));
+	  if (!ALTIVEC_OR_VSX_VECTOR_MODE (mode))
+	    continue;
+
+	  struct df_link *link = DF_REF_CHAIN (use);
+
+	  /* We know by now that these are swaps, so we can delete
+	     them confidently.  */
+	  while (link)
+	    {
+	      rtx def_insn = DF_REF_INSN (link->ref);
+	      insn_entry[INSN_UID (def_insn)].will_delete = 1;
+	      link = link->next;
+	    }
+	}
+    }
+}
+
+/* OP is either a CONST_VECTOR or an expression containing one.
+   Swap the first half of the vector with the second in the first
+   case.  Recurse to find it in the second.  */
+static void
+swap_const_vector_halves (rtx op)
+{
+  int i;
+  enum rtx_code code = GET_CODE (op);
+  if (GET_CODE (op) == CONST_VECTOR)
+    {
+      int half_units = GET_MODE_NUNITS (GET_MODE (op)) / 2;
+      for (i = 0; i < half_units; ++i)
+	{
+	  rtx temp = CONST_VECTOR_ELT (op, i);
+	  CONST_VECTOR_ELT (op, i) = CONST_VECTOR_ELT (op, i + half_units);
+	  CONST_VECTOR_ELT (op, i + half_units) = temp;
+	}
+    }
+  else
+    {
+      int j;
+      const char *fmt = GET_RTX_FORMAT (code);
+      for (i = 0; i < GET_RTX_LENGTH (code); ++i)
+	if (fmt[i] == 'e' || fmt[i] == 'u')
+	  swap_const_vector_halves (XEXP (op, i));
+	else if (fmt[i] == 'E')
+	  for (j = 0; j < XVECLEN (op, i); ++j)
+	    swap_const_vector_halves (XVECEXP (op, i, j));
+    }
+}
+
+/* Find all subregs of a vector expression that perform a narrowing,
+   and adjust the subreg index to account for doubleword swapping.  */
+static void
+adjust_subreg_index (rtx op)
+{
+  enum rtx_code code = GET_CODE (op);
+  if (code == SUBREG
+      && (GET_MODE_SIZE (GET_MODE (op))
+	  < GET_MODE_SIZE (GET_MODE (XEXP (op, 0)))))
+    {
+      unsigned int index = SUBREG_BYTE (op);
+      if (index < 8)
+	index += 8;
+      else
+	index -= 8;
+      SUBREG_BYTE (op) = index;
+    }
+
+  const char *fmt = GET_RTX_FORMAT (code);
+  int i,j;
+  for (i = 0; i < GET_RTX_LENGTH (code); ++i)
+    if (fmt[i] == 'e' || fmt[i] == 'u')
+      adjust_subreg_index (XEXP (op, i));
+    else if (fmt[i] == 'E')
+      for (j = 0; j < XVECLEN (op, i); ++j)
+	adjust_subreg_index (XVECEXP (op, i, j));
+}
+
+/* Convert the non-permuting load INSN to a permuting one.  */
+static void
+permute_load (rtx_insn *insn)
+{
+  rtx body = PATTERN (insn);
+  rtx mem_op = SET_SRC (body);
+  rtx tgt_reg = SET_DEST (body);
+  machine_mode mode = GET_MODE (tgt_reg);
+  int n_elts = GET_MODE_NUNITS (mode);
+  int half_elts = n_elts / 2;
+  rtx par = gen_rtx_PARALLEL (mode, rtvec_alloc (n_elts));
+  int i, j;
+  for (i = 0, j = half_elts; i < half_elts; ++i, ++j)
+    XVECEXP (par, 0, i) = GEN_INT (j);
+  for (i = half_elts, j = 0; j < half_elts; ++i, ++j)
+    XVECEXP (par, 0, i) = GEN_INT (j);
+  rtx sel = gen_rtx_VEC_SELECT (mode, mem_op, par);
+  SET_SRC (body) = sel;
+  INSN_CODE (insn) = -1; /* Force re-recognition.  */
+  df_insn_rescan (insn);
+
+  if (dump_file)
+    fprintf (dump_file, "Replacing load %d with permuted load\n",
+	     INSN_UID (insn));
+}
+
+/* Convert the non-permuting store INSN to a permuting one.  */
+static void
+permute_store (rtx_insn *insn)
+{
+  rtx body = PATTERN (insn);
+  rtx src_reg = SET_SRC (body);
+  machine_mode mode = GET_MODE (src_reg);
+  int n_elts = GET_MODE_NUNITS (mode);
+  int half_elts = n_elts / 2;
+  rtx par = gen_rtx_PARALLEL (mode, rtvec_alloc (n_elts));
+  int i, j;
+  for (i = 0, j = half_elts; i < half_elts; ++i, ++j)
+    XVECEXP (par, 0, i) = GEN_INT (j);
+  for (i = half_elts, j = 0; j < half_elts; ++i, ++j)
+    XVECEXP (par, 0, i) = GEN_INT (j);
+  rtx sel = gen_rtx_VEC_SELECT (mode, src_reg, par);
+  SET_SRC (body) = sel;
+  INSN_CODE (insn) = -1; /* Force re-recognition.  */
+  df_insn_rescan (insn);
+
+  if (dump_file)
+    fprintf (dump_file, "Replacing store %d with permuted store\n",
+	     INSN_UID (insn));
+}
+
+/* Given INSN that contains a vector extract operation, adjust the index
+   of the extracted lane to account for the doubleword swap.  */
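+/* For example, extracting lane 1 from a V4SI register (half_elts = 2)
+   becomes an extract of lane 3 once the doublewords are swapped.  */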
+static void
+adjust_extract (rtx_insn *insn)
+{
+  rtx pattern = PATTERN (insn);
+  if (GET_CODE (pattern) == PARALLEL)
+    pattern = XVECEXP (pattern, 0, 0);
+  rtx src = SET_SRC (pattern);
+  /* The vec_select may be wrapped in a vec_duplicate for a splat, so
+     account for that.  */
+  rtx sel = GET_CODE (src) == VEC_DUPLICATE ? XEXP (src, 0) : src;
+  rtx par = XEXP (sel, 1);
+  int half_elts = GET_MODE_NUNITS (GET_MODE (XEXP (sel, 0))) >> 1;
+  int lane = INTVAL (XVECEXP (par, 0, 0));
+  lane = lane >= half_elts ? lane - half_elts : lane + half_elts;
+  XVECEXP (par, 0, 0) = GEN_INT (lane);
+  INSN_CODE (insn) = -1; /* Force re-recognition.  */
+  df_insn_rescan (insn);
+
+  if (dump_file)
+    fprintf (dump_file, "Changing lane for extract %d\n", INSN_UID (insn));
+}
+
+/* Given INSN that contains a vector direct-splat operation, adjust the index
+   of the source lane to account for the doubleword swap.  */
+static void
+adjust_splat (rtx_insn *insn)
+{
+  rtx body = PATTERN (insn);
+  rtx unspec = XEXP (body, 1);
+  int half_elts = GET_MODE_NUNITS (GET_MODE (unspec)) >> 1;
+  int lane = INTVAL (XVECEXP (unspec, 0, 1));
+  lane = lane >= half_elts ? lane - half_elts : lane + half_elts;
+  XVECEXP (unspec, 0, 1) = GEN_INT (lane);
+  INSN_CODE (insn) = -1; /* Force re-recognition.  */
+  df_insn_rescan (insn);
+
+  if (dump_file)
+    fprintf (dump_file, "Changing lane for splat %d\n", INSN_UID (insn));
+}
+
+/* Given INSN that contains an XXPERMDI operation (that is not a doubleword
+   swap), reverse the order of the source operands and adjust the indices
+   of the source lanes to account for doubleword reversal.  */
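+/* For example, an xxpermdi that selects lanes [0, 2] from
+   (vec_concat x y) becomes, once the inputs are reversed to
+   (vec_concat y x), a selection of lanes [3-2, 3-0] = [1, 3].  */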
+static void
+adjust_xxpermdi (rtx_insn *insn)
+{
+  rtx set = PATTERN (insn);
+  rtx select = XEXP (set, 1);
+  rtx concat = XEXP (select, 0);
+  rtx src0 = XEXP (concat, 0);
+  XEXP (concat, 0) = XEXP (concat, 1);
+  XEXP (concat, 1) = src0;
+  rtx parallel = XEXP (select, 1);
+  int lane0 = INTVAL (XVECEXP (parallel, 0, 0));
+  int lane1 = INTVAL (XVECEXP (parallel, 0, 1));
+  int new_lane0 = 3 - lane1;
+  int new_lane1 = 3 - lane0;
+  XVECEXP (parallel, 0, 0) = GEN_INT (new_lane0);
+  XVECEXP (parallel, 0, 1) = GEN_INT (new_lane1);
+  INSN_CODE (insn) = -1; /* Force re-recognition.  */
+  df_insn_rescan (insn);
+
+  if (dump_file)
+    fprintf (dump_file, "Changing lanes for xxpermdi %d\n", INSN_UID (insn));
+}
+
+/* Given INSN that contains a VEC_CONCAT operation of two doublewords,
+   reverse the order of those inputs.  */
+static void
+adjust_concat (rtx_insn *insn)
+{
+  rtx set = PATTERN (insn);
+  rtx concat = XEXP (set, 1);
+  rtx src0 = XEXP (concat, 0);
+  XEXP (concat, 0) = XEXP (concat, 1);
+  XEXP (concat, 1) = src0;
+  INSN_CODE (insn) = -1; /* Force re-recognition.  */
+  df_insn_rescan (insn);
+
+  if (dump_file)
+    fprintf (dump_file, "Reversing inputs for concat %d\n", INSN_UID (insn));
+}
+
+/* Given an UNSPEC_VPERM insn, modify the mask loaded from the
+   constant pool to reflect swapped doublewords.  */
+static void
+adjust_vperm (rtx_insn *insn)
+{
+  /* We previously determined that the UNSPEC_VPERM was fed by a
+     swap of a swapping load of a TOC-relative constant pool symbol.
+     Find the MEM in the swapping load and replace it with a MEM for
+     the adjusted mask constant.  */
+  rtx set = PATTERN (insn);
+  rtx mask_reg = XVECEXP (SET_SRC (set), 0, 2);
+
+  /* Find the swap.  */
+  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+  df_ref use;
+  rtx_insn *swap_insn = 0;
+  FOR_EACH_INSN_INFO_USE (use, insn_info)
+    if (rtx_equal_p (DF_REF_REG (use), mask_reg))
+      {
+	struct df_link *def_link = DF_REF_CHAIN (use);
+	gcc_assert (def_link && !def_link->next);
+	swap_insn = DF_REF_INSN (def_link->ref);
+	break;
+      }
+  gcc_assert (swap_insn);
+  
+  /* Find the load.  */
+  insn_info = DF_INSN_INFO_GET (swap_insn);
+  rtx_insn *load_insn = 0;
+  FOR_EACH_INSN_INFO_USE (use, insn_info)
+    {
+      struct df_link *def_link = DF_REF_CHAIN (use);
+      gcc_assert (def_link && !def_link->next);
+      load_insn = DF_REF_INSN (def_link->ref);
+      break;
+    }
+  gcc_assert (load_insn);
+
+  /* Find the TOC-relative symbol access.  */
+  insn_info = DF_INSN_INFO_GET (load_insn);
+  rtx_insn *tocrel_insn = 0;
+  FOR_EACH_INSN_INFO_USE (use, insn_info)
+    {
+      struct df_link *def_link = DF_REF_CHAIN (use);
+      gcc_assert (def_link && !def_link->next);
+      tocrel_insn = DF_REF_INSN (def_link->ref);
+      break;
+    }
+  gcc_assert (tocrel_insn);
+
+  /* Find the embedded CONST_VECTOR.  We have to call toc_relative_expr_p
+     to set tocrel_base; otherwise it would be unnecessary as we've
+     already established it will return true.  */
+  rtx base, offset;
+  const_rtx tocrel_base;
+  rtx tocrel_expr = SET_SRC (PATTERN (tocrel_insn));
+  /* There is an extra level of indirection for small/large code models.  */
+  if (GET_CODE (tocrel_expr) == MEM)
+    tocrel_expr = XEXP (tocrel_expr, 0);
+  if (!toc_relative_expr_p (tocrel_expr, false, &tocrel_base, NULL))
+    gcc_unreachable ();
+  split_const (XVECEXP (tocrel_base, 0, 0), &base, &offset);
+  rtx const_vector = get_pool_constant (base);
+  /* With the extra indirection, get_pool_constant will produce the
+     real constant from the reg_equal expression, so get the real
+     constant.  */
+  if (GET_CODE (const_vector) == SYMBOL_REF)
+    const_vector = get_pool_constant (const_vector);
+  gcc_assert (GET_CODE (const_vector) == CONST_VECTOR);
+
+  /* Create an adjusted mask from the initial mask.  */
+  unsigned int new_mask[16], i, val;
+  for (i = 0; i < 16; ++i)
+    {
+      val = INTVAL (XVECEXP (const_vector, 0, i));
+      if (val < 16)
+	new_mask[i] = (val + 8) % 16;
+      else
+	new_mask[i] = ((val + 8) % 16) + 16;
+    }
+
+  /* Create a new CONST_VECTOR and a MEM that references it.  */
+  rtx vals = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16));
+  for (i = 0; i < 16; ++i)
+    XVECEXP (vals, 0, i) = GEN_INT (new_mask[i]);
+  rtx new_const_vector = gen_rtx_CONST_VECTOR (V16QImode, XVEC (vals, 0));
+  rtx new_mem = force_const_mem (V16QImode, new_const_vector);
+  /* This gives us a MEM whose base operand is a SYMBOL_REF, which we
+     can't recognize.  Force the SYMBOL_REF into a register.  */
+  if (!REG_P (XEXP (new_mem, 0)))
+    {
+      rtx base_reg = force_reg (Pmode, XEXP (new_mem, 0));
+      XEXP (new_mem, 0) = base_reg;
+      /* Move the newly created insn ahead of the load insn.  */
+      rtx_insn *force_insn = get_last_insn ();
+      remove_insn (force_insn);
+      rtx_insn *before_load_insn = PREV_INSN (load_insn);
+      add_insn_after (force_insn, before_load_insn,
+		      BLOCK_FOR_INSN (load_insn));
+      df_insn_rescan (before_load_insn);
+      df_insn_rescan (force_insn);
+    }
+
+  /* Replace the MEM in the load instruction and rescan it.  */
+  XEXP (SET_SRC (PATTERN (load_insn)), 0) = new_mem;
+  INSN_CODE (load_insn) = -1; /* Force re-recognition.  */
+  df_insn_rescan (load_insn);
+
+  if (dump_file)
+    fprintf (dump_file, "Adjusting mask for vperm %d\n", INSN_UID (insn));
+}
+
+/* The insn described by INSN_ENTRY[I] can be swapped, but only
+   with special handling.  Take care of that here.  */
+static void
+handle_special_swappables (swap_web_entry *insn_entry, unsigned i)
+{
+  rtx_insn *insn = insn_entry[i].insn;
+  rtx body = PATTERN (insn);
+
+  switch (insn_entry[i].special_handling)
+    {
+    default:
+      gcc_unreachable ();
+    case SH_CONST_VECTOR:
+      {
+	/* A CONST_VECTOR will only show up somewhere in the RHS of a SET.  */
+	gcc_assert (GET_CODE (body) == SET);
+	rtx rhs = SET_SRC (body);
+	swap_const_vector_halves (rhs);
+	if (dump_file)
+	  fprintf (dump_file, "Swapping constant halves in insn %d\n", i);
+	break;
+      }
+    case SH_SUBREG:
+      /* A subreg of the same size is already safe.  For subregs that
+	 select a smaller portion of a reg, adjust the index for
+	 swapped doublewords.  */
+      adjust_subreg_index (body);
+      if (dump_file)
+	fprintf (dump_file, "Adjusting subreg in insn %d\n", i);
+      break;
+    case SH_NOSWAP_LD:
+      /* Convert a non-permuting load to a permuting one.  */
+      permute_load (insn);
+      break;
+    case SH_NOSWAP_ST:
+      /* Convert a non-permuting store to a permuting one.  */
+      permute_store (insn);
+      break;
+    case SH_EXTRACT:
+      /* Change the lane on an extract operation.  */
+      adjust_extract (insn);
+      break;
+    case SH_SPLAT:
+      /* Change the lane on a direct-splat operation.  */
+      adjust_splat (insn);
+      break;
+    case SH_XXPERMDI:
+      /* Change the lanes on an XXPERMDI operation.  */
+      adjust_xxpermdi (insn);
+      break;
+    case SH_CONCAT:
+      /* Reverse the order of a concatenation operation.  */
+      adjust_concat (insn);
+      break;
+    case SH_VPERM:
+      /* Change the mask loaded from the constant pool for a VPERM.  */
+      adjust_vperm (insn);
+      break;
+    }
+}
+
+/* Find the insn from the Ith table entry, which is known to be a
+   register swap Y = SWAP(X).  Replace it with a copy Y = X.  */
+static void
+replace_swap_with_copy (swap_web_entry *insn_entry, unsigned i)
+{
+  rtx_insn *insn = insn_entry[i].insn;
+  rtx body = PATTERN (insn);
+  rtx src_reg = XEXP (SET_SRC (body), 0);
+  rtx copy = gen_rtx_SET (SET_DEST (body), src_reg);
+  rtx_insn *new_insn = emit_insn_before (copy, insn);
+  set_block_for_insn (new_insn, BLOCK_FOR_INSN (insn));
+  df_insn_rescan (new_insn);
+
+  if (dump_file)
+    {
+      unsigned int new_uid = INSN_UID (new_insn);
+      fprintf (dump_file, "Replacing swap %d with copy %d\n", i, new_uid);
+    }
+
+  df_insn_delete (insn);
+  remove_insn (insn);
+  insn->set_deleted ();
+}
+
+/* Dump the swap table to DUMP_FILE.  */
+static void
+dump_swap_insn_table (swap_web_entry *insn_entry)
+{
+  int e = get_max_uid ();
+  fprintf (dump_file, "\nRelevant insns with their flag settings\n\n");
+
+  for (int i = 0; i < e; ++i)
+    if (insn_entry[i].is_relevant)
+      {
+	swap_web_entry *pred_entry = (swap_web_entry *)insn_entry[i].pred ();
+	fprintf (dump_file, "%6d %6d  ", i,
+		 pred_entry && pred_entry->insn
+		 ? INSN_UID (pred_entry->insn) : 0);
+	if (insn_entry[i].is_load)
+	  fputs ("load ", dump_file);
+	if (insn_entry[i].is_store)
+	  fputs ("store ", dump_file);
+	if (insn_entry[i].is_swap)
+	  fputs ("swap ", dump_file);
+	if (insn_entry[i].is_live_in)
+	  fputs ("live-in ", dump_file);
+	if (insn_entry[i].is_live_out)
+	  fputs ("live-out ", dump_file);
+	if (insn_entry[i].contains_subreg)
+	  fputs ("subreg ", dump_file);
+	if (insn_entry[i].is_128_int)
+	  fputs ("int128 ", dump_file);
+	if (insn_entry[i].is_call)
+	  fputs ("call ", dump_file);
+	if (insn_entry[i].is_swappable)
+	  {
+	    fputs ("swappable ", dump_file);
+	    if (insn_entry[i].special_handling == SH_CONST_VECTOR)
+	      fputs ("special:constvec ", dump_file);
+	    else if (insn_entry[i].special_handling == SH_SUBREG)
+	      fputs ("special:subreg ", dump_file);
+	    else if (insn_entry[i].special_handling == SH_NOSWAP_LD)
+	      fputs ("special:load ", dump_file);
+	    else if (insn_entry[i].special_handling == SH_NOSWAP_ST)
+	      fputs ("special:store ", dump_file);
+	    else if (insn_entry[i].special_handling == SH_EXTRACT)
+	      fputs ("special:extract ", dump_file);
+	    else if (insn_entry[i].special_handling == SH_SPLAT)
+	      fputs ("special:splat ", dump_file);
+	    else if (insn_entry[i].special_handling == SH_XXPERMDI)
+	      fputs ("special:xxpermdi ", dump_file);
+	    else if (insn_entry[i].special_handling == SH_CONCAT)
+	      fputs ("special:concat ", dump_file);
+	    else if (insn_entry[i].special_handling == SH_VPERM)
+	      fputs ("special:vperm ", dump_file);
+	  }
+	if (insn_entry[i].web_not_optimizable)
+	  fputs ("unoptimizable ", dump_file);
+	if (insn_entry[i].will_delete)
+	  fputs ("delete ", dump_file);
+	fputs ("\n", dump_file);
+      }
+  fputs ("\n", dump_file);
+}
+
+/* Return a copy of ALIGN, which is an (and addr (const_int -16)),
+   with its address canonicalized to (reg) or (plus reg reg).
+   Always return a new copy to avoid problems with combine.  */
+static rtx
+alignment_with_canonical_addr (rtx align)
+{
+  rtx canon;
+  rtx addr = XEXP (align, 0);
+
+  if (REG_P (addr))
+    canon = addr;
+
+  else if (GET_CODE (addr) == PLUS)
+    {
+      rtx addrop0 = XEXP (addr, 0);
+      rtx addrop1 = XEXP (addr, 1);
+
+      if (!REG_P (addrop0))
+	addrop0 = force_reg (GET_MODE (addrop0), addrop0);
+
+      if (!REG_P (addrop1))
+	addrop1 = force_reg (GET_MODE (addrop1), addrop1);
+
+      canon = gen_rtx_PLUS (GET_MODE (addr), addrop0, addrop1);
+    }
+
+  else
+    canon = force_reg (GET_MODE (addr), addr);
+
+  return gen_rtx_AND (GET_MODE (align), canon, GEN_INT (-16));
+}
+
+/* Check whether INSN is an alignment-masking operation (an AND of an
+   address with -16), and if so, return a fully-expanded rtx for the
+   masking operation.  */
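+/* For example, the address computation feeding an lvx-style access
+   typically looks like
+
+     (set (reg:DI tmp) (and:DI (plus:DI (reg:DI base) (reg:DI index))
+			       (const_int -16)))
+
+   and is returned here in the canonical form
+   (and:DI (plus:DI (reg) (reg)) (const_int -16)).  */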
+static rtx
+alignment_mask (rtx_insn *insn)
+{
+  rtx body = PATTERN (insn);
+
+  if (GET_CODE (body) != SET
+      || GET_CODE (SET_SRC (body)) != AND
+      || !REG_P (XEXP (SET_SRC (body), 0)))
+    return 0;
+
+  rtx mask = XEXP (SET_SRC (body), 1);
+
+  if (GET_CODE (mask) == CONST_INT)
+    {
+      if (INTVAL (mask) == -16)
+	return alignment_with_canonical_addr (SET_SRC (body));
+      else
+	return 0;
+    }
+
+  if (!REG_P (mask))
+    return 0;
+
+  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+  df_ref use;
+  rtx real_mask = 0;
+
+  FOR_EACH_INSN_INFO_USE (use, insn_info)
+    {
+      if (!rtx_equal_p (DF_REF_REG (use), mask))
+	continue;
+
+      struct df_link *def_link = DF_REF_CHAIN (use);
+      if (!def_link || def_link->next)
+	return 0;
+
+      rtx_insn *const_insn = DF_REF_INSN (def_link->ref);
+      rtx const_body = PATTERN (const_insn);
+      if (GET_CODE (const_body) != SET)
+	return 0;
+
+      real_mask = SET_SRC (const_body);
+
+      if (GET_CODE (real_mask) != CONST_INT
+	  || INTVAL (real_mask) != -16)
+	return 0;
+    }
+
+  if (real_mask == 0)
+    return 0;
+
+  return alignment_with_canonical_addr (SET_SRC (body));
+}
+
+/* Given INSN that's a load or store based at BASE_REG, look for a
+   feeding computation that aligns its address on a 16-byte boundary.  */
+static rtx
+find_alignment_op (rtx_insn *insn, rtx base_reg)
+{
+  df_ref base_use;
+  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+  rtx and_operation = 0;
+
+  FOR_EACH_INSN_INFO_USE (base_use, insn_info)
+    {
+      if (!rtx_equal_p (DF_REF_REG (base_use), base_reg))
+	continue;
+
+      struct df_link *base_def_link = DF_REF_CHAIN (base_use);
+      if (!base_def_link || base_def_link->next)
+	break;
+
+      /* With stack-protector code enabled, and possibly in other
+	 circumstances, there may not be an associated insn for 
+	 the def.  */
+      if (DF_REF_IS_ARTIFICIAL (base_def_link->ref))
+	break;
+
+      rtx_insn *and_insn = DF_REF_INSN (base_def_link->ref);
+      and_operation = alignment_mask (and_insn);
+      if (and_operation != 0)
+	break;
+    }
+
+  return and_operation;
+}
+
+struct del_info { bool replace; rtx_insn *replace_insn; };
+
+/* If INSN is the load for an lvx pattern, put it in canonical form.  */
+static void
+recombine_lvx_pattern (rtx_insn *insn, del_info *to_delete)
+{
+  rtx body = PATTERN (insn);
+  gcc_assert (GET_CODE (body) == SET
+	      && GET_CODE (SET_SRC (body)) == VEC_SELECT
+	      && GET_CODE (XEXP (SET_SRC (body), 0)) == MEM);
+
+  rtx mem = XEXP (SET_SRC (body), 0);
+  rtx base_reg = XEXP (mem, 0);
+
+  rtx and_operation = find_alignment_op (insn, base_reg);
+
+  if (and_operation != 0)
+    {
+      df_ref def;
+      struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+      FOR_EACH_INSN_INFO_DEF (def, insn_info)
+	{
+	  struct df_link *link = DF_REF_CHAIN (def);
+	  if (!link || link->next)
+	    break;
+
+	  rtx_insn *swap_insn = DF_REF_INSN (link->ref);
+	  if (!insn_is_swap_p (swap_insn)
+	      || insn_is_load_p (swap_insn)
+	      || insn_is_store_p (swap_insn))
+	    break;
+
+	  /* Expected lvx pattern found.  Change the swap to
+	     a copy, and propagate the AND operation into the
+	     load.  */
+	  to_delete[INSN_UID (swap_insn)].replace = true;
+	  to_delete[INSN_UID (swap_insn)].replace_insn = swap_insn;
+
+	  XEXP (mem, 0) = and_operation;
+	  SET_SRC (body) = mem;
+	  INSN_CODE (insn) = -1; /* Force re-recognition.  */
+	  df_insn_rescan (insn);
+		  
+	  if (dump_file)
+	    fprintf (dump_file, "lvx opportunity found at %d\n",
+		     INSN_UID (insn));
+	}
+    }
+}
+
+/* If INSN is the store for an stvx pattern, put it in canonical form.  */
+static void
+recombine_stvx_pattern (rtx_insn *insn, del_info *to_delete)
+{
+  rtx body = PATTERN (insn);
+  gcc_assert (GET_CODE (body) == SET
+	      && GET_CODE (SET_DEST (body)) == MEM
+	      && GET_CODE (SET_SRC (body)) == VEC_SELECT);
+  rtx mem = SET_DEST (body);
+  rtx base_reg = XEXP (mem, 0);
+
+  rtx and_operation = find_alignment_op (insn, base_reg);
+
+  if (and_operation != 0)
+    {
+      rtx src_reg = XEXP (SET_SRC (body), 0);
+      df_ref src_use;
+      struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+      FOR_EACH_INSN_INFO_USE (src_use, insn_info)
+	{
+	  if (!rtx_equal_p (DF_REF_REG (src_use), src_reg))
+	    continue;
+
+	  struct df_link *link = DF_REF_CHAIN (src_use);
+	  if (!link || link->next)
+	    break;
+
+	  rtx_insn *swap_insn = DF_REF_INSN (link->ref);
+	  if (!insn_is_swap_p (swap_insn)
+	      || insn_is_load_p (swap_insn)
+	      || insn_is_store_p (swap_insn))
+	    break;
+
+	  /* Expected stvx pattern found.  Change the swap to
+	     a copy, and propagate the AND operation into the
+	     store.  */
+	  to_delete[INSN_UID (swap_insn)].replace = true;
+	  to_delete[INSN_UID (swap_insn)].replace_insn = swap_insn;
+
+	  XEXP (mem, 0) = and_operation;
+	  SET_SRC (body) = src_reg;
+	  INSN_CODE (insn) = -1; /* Force re-recognition.  */
+	  df_insn_rescan (insn);
+		  
+	  if (dump_file)
+	    fprintf (dump_file, "stvx opportunity found at %d\n",
+		     INSN_UID (insn));
+	}
+    }
+}
+
+/* Look for patterns created from builtin lvx and stvx calls, and
+   canonicalize them to be properly recognized as such.  */
+static void
+recombine_lvx_stvx_patterns (function *fun)
+{
+  int i;
+  basic_block bb;
+  rtx_insn *insn;
+
+  int num_insns = get_max_uid ();
+  del_info *to_delete = XCNEWVEC (del_info, num_insns);
+
+  FOR_ALL_BB_FN (bb, fun)
+    FOR_BB_INSNS (bb, insn)
+    {
+      if (!NONDEBUG_INSN_P (insn))
+	continue;
+
+      if (insn_is_load_p (insn) && insn_is_swap_p (insn))
+	recombine_lvx_pattern (insn, to_delete);
+      else if (insn_is_store_p (insn) && insn_is_swap_p (insn))
+	recombine_stvx_pattern (insn, to_delete);
+    }
+
+  /* Turning swaps into copies is delayed until now, to avoid problems
+     with deleting instructions during the insn walk.  */
+  for (i = 0; i < num_insns; i++)
+    if (to_delete[i].replace)
+      {
+	rtx swap_body = PATTERN (to_delete[i].replace_insn);
+	rtx src_reg = XEXP (SET_SRC (swap_body), 0);
+	rtx copy = gen_rtx_SET (SET_DEST (swap_body), src_reg);
+	rtx_insn *new_insn = emit_insn_before (copy,
+					       to_delete[i].replace_insn);
+	set_block_for_insn (new_insn,
+			    BLOCK_FOR_INSN (to_delete[i].replace_insn));
+	df_insn_rescan (new_insn);
+	df_insn_delete (to_delete[i].replace_insn);
+	remove_insn (to_delete[i].replace_insn);
+	to_delete[i].replace_insn->set_deleted ();
+      }
+  
+  free (to_delete);
+}
+
+/* Main entry point for this pass.  */
+unsigned int
+rs6000_analyze_swaps (function *fun)
+{
+  swap_web_entry *insn_entry;
+  basic_block bb;
+  rtx_insn *insn, *curr_insn = 0;
+
+  /* Dataflow analysis for use-def chains.  */
+  df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
+  df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
+  df_analyze ();
+  df_set_flags (DF_DEFER_INSN_RESCAN);
+
+  /* Pre-pass to recombine lvx and stvx patterns so we don't lose info.  */
+  recombine_lvx_stvx_patterns (fun);
+
+  /* Allocate structure to represent webs of insns.  */
+  insn_entry = XCNEWVEC (swap_web_entry, get_max_uid ());
+
+  /* Walk the insns to gather basic data.  */
+  FOR_ALL_BB_FN (bb, fun)
+    FOR_BB_INSNS_SAFE (bb, insn, curr_insn)
+    {
+      unsigned int uid = INSN_UID (insn);
+      if (NONDEBUG_INSN_P (insn))
+	{
+	  insn_entry[uid].insn = insn;
+
+	  if (GET_CODE (insn) == CALL_INSN)
+	    insn_entry[uid].is_call = 1;
+
+	  /* Walk the uses and defs to see if we mention vector regs.
+	     Record any constraints on optimization of such mentions.  */
+	  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+	  df_ref mention;
+	  FOR_EACH_INSN_INFO_USE (mention, insn_info)
+	    {
+	      /* We use DF_REF_REAL_REG here to get inside any subregs.  */
+	      machine_mode mode = GET_MODE (DF_REF_REAL_REG (mention));
+
+	      /* If a use gets its value from a call insn, it will be
+		 a hard register and will look like (reg:V4SI 3 3).
+		 The df analysis creates two mentions for GPR3 and GPR4,
+		 both DImode.  We must recognize this and treat it as a
+		 vector mention to ensure the call is unioned with this
+		 use.  */
+	      if (mode == DImode && DF_REF_INSN_INFO (mention))
+		{
+		  rtx feeder = DF_REF_INSN (mention);
+		  /* FIXME:  It is pretty hard to get from the df mention
+		     to the mode of the use in the insn.  We arbitrarily
+		     pick a vector mode here, even though the use might
+		     be a real DImode.  We can be too conservative
+		     (create a web larger than necessary) because of
+		     this, so consider eventually fixing this.  */
+		  if (GET_CODE (feeder) == CALL_INSN)
+		    mode = V4SImode;
+		}
+
+	      if (ALTIVEC_OR_VSX_VECTOR_MODE (mode) || mode == TImode)
+		{
+		  insn_entry[uid].is_relevant = 1;
+		  if (mode == TImode || mode == V1TImode
+		      || FLOAT128_VECTOR_P (mode))
+		    insn_entry[uid].is_128_int = 1;
+		  if (DF_REF_INSN_INFO (mention))
+		    insn_entry[uid].contains_subreg
+		      = !rtx_equal_p (DF_REF_REG (mention),
+				      DF_REF_REAL_REG (mention));
+		  union_defs (insn_entry, insn, mention);
+		}
+	    }
+	  FOR_EACH_INSN_INFO_DEF (mention, insn_info)
+	    {
+	      /* We use DF_REF_REAL_REG here to get inside any subregs.  */
+	      machine_mode mode = GET_MODE (DF_REF_REAL_REG (mention));
+
+	      /* If we're loading up a hard vector register for a call,
+		 it looks like (set (reg:V4SI 9 9) (...)).  The df
+		 analysis creates two mentions for GPR9 and GPR10, both
+		 DImode.  So relying on the mode from the mentions
+		 isn't sufficient to ensure we union the call into the
+		 web with the parameter setup code.  */
+	      rtx insn_body = PATTERN (insn);
+	      if (mode == DImode && GET_CODE (insn_body) == SET
+		  && ALTIVEC_OR_VSX_VECTOR_MODE
+		     (GET_MODE (SET_DEST (insn_body))))
+		mode = GET_MODE (SET_DEST (insn_body));
+
+	      if (ALTIVEC_OR_VSX_VECTOR_MODE (mode) || mode == TImode)
+		{
+		  insn_entry[uid].is_relevant = 1;
+		  if (mode == TImode || mode == V1TImode
+		      || FLOAT128_VECTOR_P (mode))
+		    insn_entry[uid].is_128_int = 1;
+		  if (DF_REF_INSN_INFO (mention))
+		    insn_entry[uid].contains_subreg
+		      = !rtx_equal_p (DF_REF_REG (mention),
+				      DF_REF_REAL_REG (mention));
+		  /* REG_FUNCTION_VALUE_P is not valid for subregs. */
+		  else if (REG_FUNCTION_VALUE_P (DF_REF_REG (mention)))
+		    insn_entry[uid].is_live_out = 1;
+		  union_uses (insn_entry, insn, mention);
+		}
+	    }
+
+	  if (insn_entry[uid].is_relevant)
+	    {
+	      /* Determine if this is a load or store.  */
+	      insn_entry[uid].is_load = insn_is_load_p (insn);
+	      insn_entry[uid].is_store = insn_is_store_p (insn);
+
+	      /* Determine if this is a doubleword swap.  If not,
+		 determine whether it can legally be swapped.  */
+	      if (insn_is_swap_p (insn))
+		insn_entry[uid].is_swap = 1;
+	      else
+		{
+		  unsigned int special = SH_NONE;
+		  insn_entry[uid].is_swappable
+		    = insn_is_swappable_p (insn_entry, insn, &special);
+		  if (special != SH_NONE && insn_entry[uid].contains_subreg)
+		    insn_entry[uid].is_swappable = 0;
+		  else if (special != SH_NONE)
+		    insn_entry[uid].special_handling = special;
+		  else if (insn_entry[uid].contains_subreg)
+		    insn_entry[uid].special_handling = SH_SUBREG;
+		}
+	    }
+	}
+    }
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "\nSwap insn entry table when first built\n");
+      dump_swap_insn_table (insn_entry);
+    }
+
+  /* Record unoptimizable webs.  */
+  unsigned e = get_max_uid (), i;
+  for (i = 0; i < e; ++i)
+    {
+      if (!insn_entry[i].is_relevant)
+	continue;
+
+      swap_web_entry *root
+	= (swap_web_entry*)(&insn_entry[i])->unionfind_root ();
+
+      if (insn_entry[i].is_live_in || insn_entry[i].is_live_out
+	  || (insn_entry[i].contains_subreg
+	      && insn_entry[i].special_handling != SH_SUBREG)
+	  || insn_entry[i].is_128_int || insn_entry[i].is_call
+	  || !(insn_entry[i].is_swappable || insn_entry[i].is_swap))
+	root->web_not_optimizable = 1;
+
+      /* If we have loads or stores that aren't permuting, then the
+	 optimization isn't appropriate.  */
+      else if ((insn_entry[i].is_load || insn_entry[i].is_store)
+	       && !insn_entry[i].is_swap && !insn_entry[i].is_swappable)
+	root->web_not_optimizable = 1;
+
+      /* If we have permuting loads or stores that are not accompanied
+	 by a register swap, the optimization isn't appropriate.  */
+      else if (insn_entry[i].is_load && insn_entry[i].is_swap)
+	{
+	  rtx insn = insn_entry[i].insn;
+	  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+	  df_ref def;
+
+	  FOR_EACH_INSN_INFO_DEF (def, insn_info)
+	    {
+	      struct df_link *link = DF_REF_CHAIN (def);
+
+	      if (!chain_contains_only_swaps (insn_entry, link, FOR_LOADS))
+		{
+		  root->web_not_optimizable = 1;
+		  break;
+		}
+	    }
+	}
+      else if (insn_entry[i].is_store && insn_entry[i].is_swap)
+	{
+	  rtx insn = insn_entry[i].insn;
+	  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+	  df_ref use;
+
+	  FOR_EACH_INSN_INFO_USE (use, insn_info)
+	    {
+	      struct df_link *link = DF_REF_CHAIN (use);
+
+	      if (!chain_contains_only_swaps (insn_entry, link, FOR_STORES))
+		{
+		  root->web_not_optimizable = 1;
+		  break;
+		}
+	    }
+	}
+    }
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "\nSwap insn entry table after web analysis\n");
+      dump_swap_insn_table (insn_entry);
+    }
+
+  /* For each load and store in an optimizable web (which implies
+     the loads and stores are permuting), find the associated
+     register swaps and mark them for removal.  Due to various
+     optimizations we may mark the same swap more than once.  Also
+     perform special handling for swappable insns that require it.  */
+  for (i = 0; i < e; ++i)
+    if ((insn_entry[i].is_load || insn_entry[i].is_store)
+	&& insn_entry[i].is_swap)
+      {
+	swap_web_entry* root_entry
+	  = (swap_web_entry*)((&insn_entry[i])->unionfind_root ());
+	if (!root_entry->web_not_optimizable)
+	  mark_swaps_for_removal (insn_entry, i);
+      }
+    else if (insn_entry[i].is_swappable && insn_entry[i].special_handling)
+      {
+	swap_web_entry* root_entry
+	  = (swap_web_entry*)((&insn_entry[i])->unionfind_root ());
+	if (!root_entry->web_not_optimizable)
+	  handle_special_swappables (insn_entry, i);
+      }
+
+  /* Now delete the swaps marked for removal.  */
+  for (i = 0; i < e; ++i)
+    if (insn_entry[i].will_delete)
+      replace_swap_with_copy (insn_entry, i);
+
+  /* Clean up.  */
+  free (insn_entry);
+  return 0;
+}
+
+const pass_data pass_data_analyze_swaps =
+{
+  RTL_PASS, /* type */
+  "swaps", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_NONE, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_df_finish, /* todo_flags_finish */
+};
+
+class pass_analyze_swaps : public rtl_opt_pass
+{
+public:
+  pass_analyze_swaps (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_analyze_swaps, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *)
+    {
+      return (optimize > 0 && !BYTES_BIG_ENDIAN && TARGET_VSX
+	      && !TARGET_P9_VECTOR && rs6000_optimize_swaps);
+    }
+
+  virtual unsigned int execute (function *fun)
+    {
+      return rs6000_analyze_swaps (fun);
+    }
+
+  opt_pass *clone ()
+    {
+      return new pass_analyze_swaps (m_ctxt);
+    }
+
+}; // class pass_analyze_swaps
+
+rtl_opt_pass *
+make_pass_analyze_swaps (gcc::context *ctxt)
+{
+  return new pass_analyze_swaps (ctxt);
+}
+
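Editorial aside, not part of the patch: the factory above is the only
entry point the new file exports for pass creation; instantiation and
registration remain on the rs6000.c side.  A minimal sketch of that
registration, assuming the existing rs6000 machinery (the reference
pass name "cse1" and insert position are taken from the current
rs6000.c code and may differ):

  /* Hypothetical registration sketch, as done during target option
     override; G is the global gcc::context.  */
  opt_pass *swaps_pass = make_pass_analyze_swaps (g);
  struct register_pass_info analyze_swaps_info
    = { swaps_pass, "cse1", 1, PASS_POS_INSERT_BEFORE };
  register_pass (&analyze_swaps_info);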
Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c	(revision 250005)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -39026,1862 +39026,6 @@ 
   return "";
 }
 
-
-/* Analyze vector computations and remove unnecessary doubleword
-   swaps (xxswapdi instructions).  This pass is performed only
-   for little-endian VSX code generation.
-
-   For this specific case, loads and stores of 4x32 and 2x64 vectors
-   are inefficient.  These are implemented using the lvxd2x and
-   stvxd2x instructions, which invert the order of doublewords in
-   a vector register.  Thus the code generation inserts an xxswapdi
-   after each such load, and prior to each such store.  (For spill
-   code after register assignment, an additional xxswapdi is inserted
-   following each store in order to return a hard register to its
-   unpermuted value.)
-
-   The extra xxswapdi instructions reduce performance.  This can be
-   particularly bad for vectorized code.  The purpose of this pass
-   is to reduce the number of xxswapdi instructions required for
-   correctness.
-
-   The primary insight is that much code that operates on vectors
-   does not care about the relative order of elements in a register,
-   so long as the correct memory order is preserved.  If we have
-   a computation where all input values are provided by lvxd2x/xxswapdi
-   sequences, all outputs are stored using xxswapdi/stvxd2x sequences,
-   and all intermediate computations are pure SIMD (independent of
-   element order), then all the xxswapdi's associated with the loads
-   and stores may be removed.
-
-   This pass uses some of the infrastructure and logical ideas from
-   the "web" pass in web.c.  We create maximal webs of computations
-   fitting the description above using union-find.  Each such web is
-   then optimized by removing its unnecessary xxswapdi instructions.
-
-   The pass is placed prior to global optimization so that we can
-   perform the optimization in the safest and simplest way possible;
-   that is, by replacing each xxswapdi insn with a register copy insn.
-   Subsequent forward propagation will remove copies where possible.
-
-   There are some operations sensitive to element order for which we
-   can still allow the operation, provided we modify those operations.
-   These include CONST_VECTORs, for which we must swap the first and
-   second halves of the constant vector; and SUBREGs, for which we
-   must adjust the byte offset to account for the swapped doublewords.
-   A remaining opportunity would be non-immediate-form splats, for
-   which we should adjust the selected lane of the input.  We should
-   also make code generation adjustments for sum-across operations,
-   since this is a common vectorizer reduction.
-
-   Because we run prior to the first split, we can see loads and stores
-   here that match *vsx_le_perm_{load,store}_<mode>.  These are vanilla
-   vector loads and stores that have not yet been split into a permuting
-   load/store and a swap.  (One way this can happen is with a builtin
-   call to vec_vsx_{ld,st}.)  We can handle these as well, but rather
-   than deleting a swap, we convert the load/store into a permuting
-   load/store (which effectively removes the swap).  */
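Editorial aside, not part of the patch: a minimal example of the kind
of code this pass improves.  Assuming P8 little-endian compilation
with auto-vectorization enabled, each vector load and store below
expands to an lvxd2x or stvxd2x plus an xxswapdi; because addition is
independent of element order, every xxswapdi in the loop is removable:

  /* All loads/stores vectorize to lvxd2x/stvxd2x + xxswapdi on P8 LE;
     the adds are pure SIMD, so the entire web is optimizable.  */
  void
  vadd (double *restrict a, double *restrict b, double *restrict c)
  {
    for (int i = 0; i < 1024; i++)
      c[i] = a[i] + b[i];
  }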
-
-/* Notes on Permutes
-
-   We do not currently handle computations that contain permutes.  There
-   is a general transformation that can be performed correctly, but it
-   may introduce more expensive code than it replaces.  To handle these
-   would require a cost model to determine when to perform the optimization.
-   This commentary records how this could be done if desired.
-
-   The most general permute is something like this (example for V16QI):
-
-   (vec_select:V16QI (vec_concat:V32QI (op1:V16QI) (op2:V16QI))
-                     (parallel [(const_int a0) (const_int a1)
-                                 ...
-                                (const_int a14) (const_int a15)]))
-
-   where a0,...,a15 are in [0,31] and select which elements of op1
-   and op2 appear in the result.
-
-   Regardless of mode, we can convert the PARALLEL to a mask of 16
-   byte-element selectors.  Let's call this M, with M[i] representing
-   the ith byte-element selector value.  Then if we swap doublewords
-   throughout the computation, we can get correct behavior by replacing
-   M with M' as follows:
-
-    M'[i] = { (M[i]+8)%16      : M[i] in [0,15]
-            { ((M[i]+8)%16)+16 : M[i] in [16,31]
-
-   This seems promising at first, since we are just replacing one mask
-   with another.  But certain masks are preferable to others.  If M
-   is a mask that matches a vmrghh pattern, for example, M' certainly
-   will not.  Instead of a single vmrghh, we would generate a load of
-   M' and a vperm.  So we would need to know how many xxswapd's we can
-   remove as a result of this transformation to determine if it's
-   profitable; and preferably the logic would need to be aware of all
-   the special preferable masks.
-
-   Another form of permute is an UNSPEC_VPERM, in which the mask is
-   already in a register.  In some cases, this mask may be a constant
-   that we can discover with ud-chains, in which case the above
-   transformation is ok.  However, the common usage here is for the
-   mask to be produced by an UNSPEC_LVSL, in which case the mask 
-   cannot be known at compile time.  In such a case we would have to
-   generate several instructions to compute M' as above at run time,
-   and a cost model is needed again.
-
-   However, when the mask M for an UNSPEC_VPERM is loaded from the
-   constant pool, we can replace M with M' as above at no cost
-   beyond adding a constant pool entry.  */
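Editorial aside, not part of the patch: the M-to-M' remapping above is
small enough to state directly in C.  This sketch mirrors the
arithmetic that adjust_vperm later applies to a constant-pool mask:

  /* Remap a 16-byte permute control M in place to the M' described
     above: swap doublewords within the selected input while keeping
     the choice between op1 (selectors 0-15) and op2 (16-31).  */
  void
  remap_vperm_mask (unsigned char m[16])
  {
    for (int i = 0; i < 16; i++)
      m[i] = m[i] < 16 ? (m[i] + 8) % 16 : ((m[i] + 8) % 16) + 16;
  }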
-
-/* This is based on the union-find logic in web.c.  web_entry_base is
-   defined in df.h.  */
-class swap_web_entry : public web_entry_base
-{
- public:
-  /* Pointer to the insn.  */
-  rtx_insn *insn;
-  /* Set if insn contains a mention of a vector register.  All other
-     fields are undefined if this field is unset.  */
-  unsigned int is_relevant : 1;
-  /* Set if insn is a load.  */
-  unsigned int is_load : 1;
-  /* Set if insn is a store.  */
-  unsigned int is_store : 1;
-  /* Set if insn is a doubleword swap.  This can either be a register swap
-     or a permuting load or store (test is_load and is_store for this).  */
-  unsigned int is_swap : 1;
-  /* Set if the insn has a live-in use of a parameter register.  */
-  unsigned int is_live_in : 1;
-  /* Set if the insn has a live-out def of a return register.  */
-  unsigned int is_live_out : 1;
-  /* Set if the insn contains a subreg reference of a vector register.  */
-  unsigned int contains_subreg : 1;
-  /* Set if the insn contains a 128-bit integer operand.  */
-  unsigned int is_128_int : 1;
-  /* Set if this is a call-insn.  */
-  unsigned int is_call : 1;
-  /* Set if this insn does not perform a vector operation for which
-     element order matters, or if we know how to fix it up if it does.
-     Undefined if is_swap is set.  */
-  unsigned int is_swappable : 1;
-  /* A nonzero value indicates what kind of special handling for this
-     insn is required if doublewords are swapped.  Undefined if
-     is_swappable is not set.  */
-  unsigned int special_handling : 4;
-  /* Set if the web represented by this entry cannot be optimized.  */
-  unsigned int web_not_optimizable : 1;
-  /* Set if this insn should be deleted.  */
-  unsigned int will_delete : 1;
-};
-
-enum special_handling_values {
-  SH_NONE = 0,
-  SH_CONST_VECTOR,
-  SH_SUBREG,
-  SH_NOSWAP_LD,
-  SH_NOSWAP_ST,
-  SH_EXTRACT,
-  SH_SPLAT,
-  SH_XXPERMDI,
-  SH_CONCAT,
-  SH_VPERM
-};
-
-/* Union INSN with all insns containing definitions that reach USE.
-   Detect whether USE is live-in to the current function.  */
-static void
-union_defs (swap_web_entry *insn_entry, rtx insn, df_ref use)
-{
-  struct df_link *link = DF_REF_CHAIN (use);
-
-  if (!link)
-    insn_entry[INSN_UID (insn)].is_live_in = 1;
-
-  while (link)
-    {
-      if (DF_REF_IS_ARTIFICIAL (link->ref))
-	insn_entry[INSN_UID (insn)].is_live_in = 1;
-
-      if (DF_REF_INSN_INFO (link->ref))
-	{
-	  rtx def_insn = DF_REF_INSN (link->ref);
-	  (void)unionfind_union (insn_entry + INSN_UID (insn),
-				 insn_entry + INSN_UID (def_insn));
-	}
-
-      link = link->next;
-    }
-}
-
-/* Union INSN with all insns containing uses reached from DEF.
-   Detect whether DEF is live-out from the current function.  */
-static void
-union_uses (swap_web_entry *insn_entry, rtx insn, df_ref def)
-{
-  struct df_link *link = DF_REF_CHAIN (def);
-
-  if (!link)
-    insn_entry[INSN_UID (insn)].is_live_out = 1;
-
-  while (link)
-    {
-      /* This could be an eh use or some other artificial use;
-	 we treat these all the same (killing the optimization).  */
-      if (DF_REF_IS_ARTIFICIAL (link->ref))
-	insn_entry[INSN_UID (insn)].is_live_out = 1;
-
-      if (DF_REF_INSN_INFO (link->ref))
-	{
-	  rtx use_insn = DF_REF_INSN (link->ref);
-	  (void)unionfind_union (insn_entry + INSN_UID (insn),
-				 insn_entry + INSN_UID (use_insn));
-	}
-
-      link = link->next;
-    }
-}
-
-/* Return 1 iff INSN is a load insn, including permuting loads that
-   represent an lvxd2x instruction; else return 0.  */
-static unsigned int
-insn_is_load_p (rtx insn)
-{
-  rtx body = PATTERN (insn);
-
-  if (GET_CODE (body) == SET)
-    {
-      if (GET_CODE (SET_SRC (body)) == MEM)
-	return 1;
-
-      if (GET_CODE (SET_SRC (body)) == VEC_SELECT
-	  && GET_CODE (XEXP (SET_SRC (body), 0)) == MEM)
-	return 1;
-
-      return 0;
-    }
-
-  if (GET_CODE (body) != PARALLEL)
-    return 0;
-
-  rtx set = XVECEXP (body, 0, 0);
-
-  if (GET_CODE (set) == SET && GET_CODE (SET_SRC (set)) == MEM)
-    return 1;
-
-  return 0;
-}
-
-/* Return 1 iff INSN is a store insn, including permuting stores that
-   represent an stvxd2x instruction; else return 0.  */
-static unsigned int
-insn_is_store_p (rtx insn)
-{
-  rtx body = PATTERN (insn);
-  if (GET_CODE (body) == SET && GET_CODE (SET_DEST (body)) == MEM)
-    return 1;
-  if (GET_CODE (body) != PARALLEL)
-    return 0;
-  rtx set = XVECEXP (body, 0, 0);
-  if (GET_CODE (set) == SET && GET_CODE (SET_DEST (set)) == MEM)
-    return 1;
-  return 0;
-}
-
-/* Return 1 iff INSN swaps doublewords.  This may be a reg-reg swap,
-   a permuting load, or a permuting store.  */
-static unsigned int
-insn_is_swap_p (rtx insn)
-{
-  rtx body = PATTERN (insn);
-  if (GET_CODE (body) != SET)
-    return 0;
-  rtx rhs = SET_SRC (body);
-  if (GET_CODE (rhs) != VEC_SELECT)
-    return 0;
-  rtx parallel = XEXP (rhs, 1);
-  if (GET_CODE (parallel) != PARALLEL)
-    return 0;
-  unsigned int len = XVECLEN (parallel, 0);
-  if (len != 2 && len != 4 && len != 8 && len != 16)
-    return 0;
-  for (unsigned int i = 0; i < len / 2; ++i)
-    {
-      rtx op = XVECEXP (parallel, 0, i);
-      if (GET_CODE (op) != CONST_INT || INTVAL (op) != len / 2 + i)
-	return 0;
-    }
-  for (unsigned int i = len / 2; i < len; ++i)
-    {
-      rtx op = XVECEXP (parallel, 0, i);
-      if (GET_CODE (op) != CONST_INT || INTVAL (op) != i - len / 2)
-	return 0;
-    }
-  return 1;
-}
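Editorial aside, not part of the patch: the selectors accepted above,
written out.  For a V2DI swap the PARALLEL reads [1 0], for V4SI
[2 3 0 1], and in general lane i must select element (i + len/2) % len.
A standalone restatement of the check:

  /* Return 1 iff SEL of length LEN (2, 4, 8, or 16) describes a
     doubleword swap: lane I selects element (I + LEN/2) % LEN.  */
  static int
  is_swap_selector (const int *sel, int len)
  {
    for (int i = 0; i < len; i++)
      if (sel[i] != (i + len / 2) % len)
	return 0;
    return 1;
  }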
-
-/* Return TRUE if insn is a swap fed by a load from the constant pool.  */
-static bool
-const_load_sequence_p (swap_web_entry *insn_entry, rtx insn)
-{
-  unsigned uid = INSN_UID (insn);
-  if (!insn_entry[uid].is_swap || insn_entry[uid].is_load)
-    return false;
-
-  const_rtx tocrel_base;
-
-  /* Find the unique use in the swap and locate its def.  If the def
-     isn't unique, punt.  */
-  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
-  df_ref use;
-  FOR_EACH_INSN_INFO_USE (use, insn_info)
-    {
-      struct df_link *def_link = DF_REF_CHAIN (use);
-      if (!def_link || def_link->next)
-	return false;
-
-      rtx def_insn = DF_REF_INSN (def_link->ref);
-      unsigned uid2 = INSN_UID (def_insn);
-      if (!insn_entry[uid2].is_load || !insn_entry[uid2].is_swap)
-	return false;
-
-      rtx body = PATTERN (def_insn);
-      if (GET_CODE (body) != SET
-	  || GET_CODE (SET_SRC (body)) != VEC_SELECT
-	  || GET_CODE (XEXP (SET_SRC (body), 0)) != MEM)
-	return false;
-
-      rtx mem = XEXP (SET_SRC (body), 0);
-      rtx base_reg = XEXP (mem, 0);
-
-      df_ref base_use;
-      insn_info = DF_INSN_INFO_GET (def_insn);
-      FOR_EACH_INSN_INFO_USE (base_use, insn_info)
-	{
-	  if (!rtx_equal_p (DF_REF_REG (base_use), base_reg))
-	    continue;
-
-	  struct df_link *base_def_link = DF_REF_CHAIN (base_use);
-	  if (!base_def_link || base_def_link->next)
-	    return false;
-
-	  rtx tocrel_insn = DF_REF_INSN (base_def_link->ref);
-	  rtx tocrel_body = PATTERN (tocrel_insn);
-	  rtx base, offset;
-	  if (GET_CODE (tocrel_body) != SET)
-	    return false;
-	  /* There is an extra level of indirection for small/large
-	     code models.  */
-	  rtx tocrel_expr = SET_SRC (tocrel_body);
-	  if (GET_CODE (tocrel_expr) == MEM)
-	    tocrel_expr = XEXP (tocrel_expr, 0);
-	  if (!toc_relative_expr_p (tocrel_expr, false, &tocrel_base, NULL))
-	    return false;
-	  split_const (XVECEXP (tocrel_base, 0, 0), &base, &offset);
-	  if (GET_CODE (base) != SYMBOL_REF || !CONSTANT_POOL_ADDRESS_P (base))
-	    return false;
-	}
-    }
-  return true;
-}
-
-/* Return TRUE iff OP matches a V2DF reduction pattern.  See the
-   definition of vsx_reduc_<VEC_reduc_name>_v2df in vsx.md.  */
-static bool
-v2df_reduction_p (rtx op)
-{
-  if (GET_MODE (op) != V2DFmode)
-    return false;
-  
-  enum rtx_code code = GET_CODE (op);
-  if (code != PLUS && code != SMIN && code != SMAX)
-    return false;
-
-  rtx concat = XEXP (op, 0);
-  if (GET_CODE (concat) != VEC_CONCAT)
-    return false;
-
-  rtx select0 = XEXP (concat, 0);
-  rtx select1 = XEXP (concat, 1);
-  if (GET_CODE (select0) != VEC_SELECT || GET_CODE (select1) != VEC_SELECT)
-    return false;
-
-  rtx reg0 = XEXP (select0, 0);
-  rtx reg1 = XEXP (select1, 0);
-  if (!rtx_equal_p (reg0, reg1) || !REG_P (reg0))
-    return false;
-
-  rtx parallel0 = XEXP (select0, 1);
-  rtx parallel1 = XEXP (select1, 1);
-  if (GET_CODE (parallel0) != PARALLEL || GET_CODE (parallel1) != PARALLEL)
-    return false;
-
-  if (!rtx_equal_p (XVECEXP (parallel0, 0, 0), const1_rtx)
-      || !rtx_equal_p (XVECEXP (parallel1, 0, 0), const0_rtx))
-    return false;
-
-  return true;
-}
-
-/* Return 1 iff OP is an operand that will not be affected by having
-   vector doublewords swapped in memory.  */
-static unsigned int
-rtx_is_swappable_p (rtx op, unsigned int *special)
-{
-  enum rtx_code code = GET_CODE (op);
-  int i, j;
-  rtx parallel;
-
-  switch (code)
-    {
-    case LABEL_REF:
-    case SYMBOL_REF:
-    case CLOBBER:
-    case REG:
-      return 1;
-
-    case VEC_CONCAT:
-    case ASM_INPUT:
-    case ASM_OPERANDS:
-      return 0;
-
-    case CONST_VECTOR:
-      {
-	*special = SH_CONST_VECTOR;
-	return 1;
-      }
-
-    case VEC_DUPLICATE:
-      /* Opportunity: If XEXP (op, 0) has the same mode as the result,
-	 and XEXP (op, 1) is a PARALLEL with a single QImode const int,
-	 it represents a vector splat for which we can do special
-	 handling.  */
-      if (GET_CODE (XEXP (op, 0)) == CONST_INT)
-	return 1;
-      else if (REG_P (XEXP (op, 0))
-	       && GET_MODE_INNER (GET_MODE (op)) == GET_MODE (XEXP (op, 0)))
-	/* This catches V2DF and V2DI splat, at a minimum.  */
-	return 1;
-      else if (GET_CODE (XEXP (op, 0)) == TRUNCATE
-	       && REG_P (XEXP (XEXP (op, 0), 0))
-	       && GET_MODE_INNER (GET_MODE (op)) == GET_MODE (XEXP (op, 0)))
-	/* This catches splat of a truncated value.  */
-	return 1;
-      else if (GET_CODE (XEXP (op, 0)) == VEC_SELECT)
-	/* If the duplicated item is from a select, defer to the select
-	   processing to see if we can change the lane for the splat.  */
-	return rtx_is_swappable_p (XEXP (op, 0), special);
-      else
-	return 0;
-
-    case VEC_SELECT:
-      /* A vec_extract operation is ok if we change the lane.  */
-      if (GET_CODE (XEXP (op, 0)) == REG
-	  && GET_MODE_INNER (GET_MODE (XEXP (op, 0))) == GET_MODE (op)
-	  && GET_CODE ((parallel = XEXP (op, 1))) == PARALLEL
-	  && XVECLEN (parallel, 0) == 1
-	  && GET_CODE (XVECEXP (parallel, 0, 0)) == CONST_INT)
-	{
-	  *special = SH_EXTRACT;
-	  return 1;
-	}
-      /* An XXPERMDI is ok if we adjust the lanes.  Note that if the
-	 XXPERMDI is a swap operation, it will be identified by
-	 insn_is_swap_p and therefore we won't get here.  */
-      else if (GET_CODE (XEXP (op, 0)) == VEC_CONCAT
-	       && (GET_MODE (XEXP (op, 0)) == V4DFmode
-		   || GET_MODE (XEXP (op, 0)) == V4DImode)
-	       && GET_CODE ((parallel = XEXP (op, 1))) == PARALLEL
-	       && XVECLEN (parallel, 0) == 2
-	       && GET_CODE (XVECEXP (parallel, 0, 0)) == CONST_INT
-	       && GET_CODE (XVECEXP (parallel, 0, 1)) == CONST_INT)
-	{
-	  *special = SH_XXPERMDI;
-	  return 1;
-	}
-      else if (v2df_reduction_p (op))
-	return 1;
-      else
-	return 0;
-
-    case UNSPEC:
-      {
-	/* Various operations are unsafe for this optimization, at least
-	   without significant additional work.  Permutes are obviously
-	   problematic, as both the permute control vector and the ordering
-	   of the target values are invalidated by doubleword swapping.
-	   Vector pack and unpack modify the number of vector lanes.
-	   Merge-high/low will not operate correctly on swapped operands.
-	   Vector shifts across element boundaries are clearly uncool,
-	   as are vector select and concatenate operations.  Vector
-	   sum-across instructions define one operand with a specific
-	   order-dependent element, so additional fixup code would be
-	   needed to make those work.  Vector set and non-immediate-form
-	   vector splat are element-order sensitive.  A few of these
-	   cases might be workable with special handling if required.
-	   Adding cost modeling would be appropriate in some cases.  */
-	int val = XINT (op, 1);
-	switch (val)
-	  {
-	  default:
-	    break;
-	  case UNSPEC_VMRGH_DIRECT:
-	  case UNSPEC_VMRGL_DIRECT:
-	  case UNSPEC_VPACK_SIGN_SIGN_SAT:
-	  case UNSPEC_VPACK_SIGN_UNS_SAT:
-	  case UNSPEC_VPACK_UNS_UNS_MOD:
-	  case UNSPEC_VPACK_UNS_UNS_MOD_DIRECT:
-	  case UNSPEC_VPACK_UNS_UNS_SAT:
-	  case UNSPEC_VPERM:
-	  case UNSPEC_VPERM_UNS:
-	  case UNSPEC_VPERMHI:
-	  case UNSPEC_VPERMSI:
-	  case UNSPEC_VPKPX:
-	  case UNSPEC_VSLDOI:
-	  case UNSPEC_VSLO:
-	  case UNSPEC_VSRO:
-	  case UNSPEC_VSUM2SWS:
-	  case UNSPEC_VSUM4S:
-	  case UNSPEC_VSUM4UBS:
-	  case UNSPEC_VSUMSWS:
-	  case UNSPEC_VSUMSWS_DIRECT:
-	  case UNSPEC_VSX_CONCAT:
-	  case UNSPEC_VSX_SET:
-	  case UNSPEC_VSX_SLDWI:
-	  case UNSPEC_VUNPACK_HI_SIGN:
-	  case UNSPEC_VUNPACK_HI_SIGN_DIRECT:
-	  case UNSPEC_VUNPACK_LO_SIGN:
-	  case UNSPEC_VUNPACK_LO_SIGN_DIRECT:
-	  case UNSPEC_VUPKHPX:
-	  case UNSPEC_VUPKHS_V4SF:
-	  case UNSPEC_VUPKHU_V4SF:
-	  case UNSPEC_VUPKLPX:
-	  case UNSPEC_VUPKLS_V4SF:
-	  case UNSPEC_VUPKLU_V4SF:
-	  case UNSPEC_VSX_CVDPSPN:
-	  case UNSPEC_VSX_CVSPDP:
-	  case UNSPEC_VSX_CVSPDPN:
-	  case UNSPEC_VSX_EXTRACT:
-	  case UNSPEC_VSX_VSLO:
-	  case UNSPEC_VSX_VEC_INIT:
-	    return 0;
-	  case UNSPEC_VSPLT_DIRECT:
-	  case UNSPEC_VSX_XXSPLTD:
-	    *special = SH_SPLAT;
-	    return 1;
-	  case UNSPEC_REDUC_PLUS:
-	  case UNSPEC_REDUC:
-	    return 1;
-	  }
-      }
-
-    default:
-      break;
-    }
-
-  const char *fmt = GET_RTX_FORMAT (code);
-  int ok = 1;
-
-  for (i = 0; i < GET_RTX_LENGTH (code); ++i)
-    if (fmt[i] == 'e' || fmt[i] == 'u')
-      {
-	unsigned int special_op = SH_NONE;
-	ok &= rtx_is_swappable_p (XEXP (op, i), &special_op);
-	if (special_op == SH_NONE)
-	  continue;
-	/* Ensure we never have two kinds of special handling
-	   for the same insn.  */
-	if (*special != SH_NONE && *special != special_op)
-	  return 0;
-	*special = special_op;
-      }
-    else if (fmt[i] == 'E')
-      for (j = 0; j < XVECLEN (op, i); ++j)
-	{
-	  unsigned int special_op = SH_NONE;
-	  ok &= rtx_is_swappable_p (XVECEXP (op, i, j), &special_op);
-	  if (special_op == SH_NONE)
-	    continue;
-	  /* Ensure we never have two kinds of special handling
-	     for the same insn.  */
-	  if (*special != SH_NONE && *special != special_op)
-	    return 0;
-	  *special = special_op;
-	}
-
-  return ok;
-}
-
-/* Return 1 iff INSN is an operand that will not be affected by
-   having vector doublewords swapped in memory (in which case
-   *SPECIAL is unchanged), or that can be modified to be correct
-   if vector doublewords are swapped in memory (in which case
-   *SPECIAL is changed to a value indicating how).  */
-static unsigned int
-insn_is_swappable_p (swap_web_entry *insn_entry, rtx insn,
-		     unsigned int *special)
-{
-  /* Calls are always bad.  */
-  if (GET_CODE (insn) == CALL_INSN)
-    return 0;
-
-  /* Loads and stores seen here are not permuting, but we can still
-     fix them up by converting them to permuting ones.  Exceptions:
-     UNSPEC_LVE, UNSPEC_LVX, and UNSPEC_STVX, which have a PARALLEL
-     body instead of a SET; and UNSPEC_STVE, which has an UNSPEC
-     for the SET source.  Also we must now make an exception for lvx
-     and stvx when they are not in the UNSPEC_LVX/STVX form (with the
-     explicit "& -16") since this leads to unrecognizable insns.  */
-  rtx body = PATTERN (insn);
-  int i = INSN_UID (insn);
-
-  if (insn_entry[i].is_load)
-    {
-      if (GET_CODE (body) == SET)
-	{
-	  rtx rhs = SET_SRC (body);
-	  /* Even without a swap, the RHS might be a vec_select for, say,
-	     a byte-reversing load.  */
-	  if (GET_CODE (rhs) != MEM)
-	    return 0;
-	  if (GET_CODE (XEXP (rhs, 0)) == AND)
-	    return 0;
-
-	  *special = SH_NOSWAP_LD;
-	  return 1;
-	}
-      else
-	return 0;
-    }
-
-  if (insn_entry[i].is_store)
-    {
-      if (GET_CODE (body) == SET
-	  && GET_CODE (SET_SRC (body)) != UNSPEC)
-	{
-	  rtx lhs = SET_DEST (body);
-	  /* Even without a swap, the LHS might be a vec_select for, say,
-	     a byte-reversing store.  */
-	  if (GET_CODE (lhs) != MEM)
-	    return 0;
-	  if (GET_CODE (XEXP (lhs, 0)) == AND)
-	    return 0;
-	  
-	  *special = SH_NOSWAP_ST;
-	  return 1;
-	}
-      else
-	return 0;
-    }
-
-  /* A convert to single precision can be left as is, provided that
-     all of its uses are in xxspltw instructions that splat BE element
-     zero.  */
-  if (GET_CODE (body) == SET
-      && GET_CODE (SET_SRC (body)) == UNSPEC
-      && XINT (SET_SRC (body), 1) == UNSPEC_VSX_CVDPSPN)
-    {
-      df_ref def;
-      struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
-
-      FOR_EACH_INSN_INFO_DEF (def, insn_info)
-	{
-	  struct df_link *link = DF_REF_CHAIN (def);
-	  if (!link)
-	    return 0;
-
-	  for (; link; link = link->next)
-	    {
-	      rtx use_insn = DF_REF_INSN (link->ref);
-	      rtx use_body = PATTERN (use_insn);
-	      if (GET_CODE (use_body) != SET
-		  || GET_CODE (SET_SRC (use_body)) != UNSPEC
-		  || XINT (SET_SRC (use_body), 1) != UNSPEC_VSX_XXSPLTW
-		  || XVECEXP (SET_SRC (use_body), 0, 1) != const0_rtx)
-		return 0;
-	    }
-	}
-
-      return 1;
-    }
-
-  /* A concatenation of two doublewords is ok if we reverse the
-     order of the inputs.  */
-  if (GET_CODE (body) == SET
-      && GET_CODE (SET_SRC (body)) == VEC_CONCAT
-      && (GET_MODE (SET_SRC (body)) == V2DFmode
-	  || GET_MODE (SET_SRC (body)) == V2DImode))
-    {
-      *special = SH_CONCAT;
-      return 1;
-    }
-
-  /* V2DF reductions are always swappable.  */
-  if (GET_CODE (body) == PARALLEL)
-    {
-      rtx expr = XVECEXP (body, 0, 0);
-      if (GET_CODE (expr) == SET
-	  && v2df_reduction_p (SET_SRC (expr)))
-	return 1;
-    }
-
-  /* An UNSPEC_VPERM is ok if the mask operand is loaded from the
-     constant pool.  */
-  if (GET_CODE (body) == SET
-      && GET_CODE (SET_SRC (body)) == UNSPEC
-      && XINT (SET_SRC (body), 1) == UNSPEC_VPERM
-      && XVECLEN (SET_SRC (body), 0) == 3
-      && GET_CODE (XVECEXP (SET_SRC (body), 0, 2)) == REG)
-    {
-      rtx mask_reg = XVECEXP (SET_SRC (body), 0, 2);
-      struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
-      df_ref use;
-      FOR_EACH_INSN_INFO_USE (use, insn_info)
-	if (rtx_equal_p (DF_REF_REG (use), mask_reg))
-	  {
-	    struct df_link *def_link = DF_REF_CHAIN (use);
-	    /* Punt if multiple definitions for this reg.  */
-	    if (def_link && !def_link->next
-		&& const_load_sequence_p (insn_entry,
-					  DF_REF_INSN (def_link->ref)))
-	      {
-		*special = SH_VPERM;
-		return 1;
-	      }
-	  }
-    }
-
-  /* Otherwise check the operands for vector lane violations.  */
-  return rtx_is_swappable_p (body, special);
-}
-
-enum chain_purpose { FOR_LOADS, FOR_STORES };
-
-/* Return true if the UD or DU chain headed by LINK is non-empty,
-   and every entry on the chain references an insn that is a
-   register swap.  Furthermore, if PURPOSE is FOR_LOADS, each such
-   register swap must have only permuting loads as reaching defs.
-   If PURPOSE is FOR_STORES, each such register swap must have only
-   register swaps or permuting stores as reached uses.  */
-static bool
-chain_contains_only_swaps (swap_web_entry *insn_entry, struct df_link *link,
-			   enum chain_purpose purpose)
-{
-  if (!link)
-    return false;
-
-  for (; link; link = link->next)
-    {
-      if (!ALTIVEC_OR_VSX_VECTOR_MODE (GET_MODE (DF_REF_REG (link->ref))))
-	continue;
-
-      if (DF_REF_IS_ARTIFICIAL (link->ref))
-	return false;
-
-      rtx reached_insn = DF_REF_INSN (link->ref);
-      unsigned uid = INSN_UID (reached_insn);
-      struct df_insn_info *insn_info = DF_INSN_INFO_GET (reached_insn);
-
-      if (!insn_entry[uid].is_swap || insn_entry[uid].is_load
-	  || insn_entry[uid].is_store)
-	return false;
-
-      if (purpose == FOR_LOADS)
-	{
-	  df_ref use;
-	  FOR_EACH_INSN_INFO_USE (use, insn_info)
-	    {
-	      struct df_link *swap_link = DF_REF_CHAIN (use);
-
-	      while (swap_link)
-		{
-		  if (DF_REF_IS_ARTIFICIAL (swap_link->ref))
-		    return false;
-
-		  rtx swap_def_insn = DF_REF_INSN (swap_link->ref);
-		  unsigned uid2 = INSN_UID (swap_def_insn);
-
-		  /* Only permuting loads are allowed.  */
-		  if (!insn_entry[uid2].is_swap || !insn_entry[uid2].is_load)
-		    return false;
-
-		  swap_link = swap_link->next;
-		}
-	    }
-	}
-      else if (purpose == FOR_STORES)
-	{
-	  df_ref def;
-	  FOR_EACH_INSN_INFO_DEF (def, insn_info)
-	    {
-	      struct df_link *swap_link = DF_REF_CHAIN (def);
-
-	      while (swap_link)
-		{
-		  if (DF_REF_IS_ARTIFICIAL (swap_link->ref))
-		    return false;
-
-		  rtx swap_use_insn = DF_REF_INSN (swap_link->ref);
-		  unsigned uid2 = INSN_UID (swap_use_insn);
-
-		  /* Permuting stores or register swaps are allowed.  */
-		  if (!insn_entry[uid2].is_swap || insn_entry[uid2].is_load)
-		    return false;
-
-		  swap_link = swap_link->next;
-		}
-	    }
-	}
-    }
-
-  return true;
-}
-
-/* Mark the xxswapdi instructions associated with permuting loads and
-   stores for removal.  Note that we only flag them for deletion here,
-   as there is a possibility of a swap being reached from multiple
-   loads, etc.  */
-static void
-mark_swaps_for_removal (swap_web_entry *insn_entry, unsigned int i)
-{
-  rtx insn = insn_entry[i].insn;
-  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
-
-  if (insn_entry[i].is_load)
-    {
-      df_ref def;
-      FOR_EACH_INSN_INFO_DEF (def, insn_info)
-	{
-	  struct df_link *link = DF_REF_CHAIN (def);
-
-	  /* We know by now that these are swaps, so we can delete
-	     them confidently.  */
-	  while (link)
-	    {
-	      rtx use_insn = DF_REF_INSN (link->ref);
-	      insn_entry[INSN_UID (use_insn)].will_delete = 1;
-	      link = link->next;
-	    }
-	}
-    }
-  else if (insn_entry[i].is_store)
-    {
-      df_ref use;
-      FOR_EACH_INSN_INFO_USE (use, insn_info)
-	{
-	  /* Ignore uses for addressability.  */
-	  machine_mode mode = GET_MODE (DF_REF_REG (use));
-	  if (!ALTIVEC_OR_VSX_VECTOR_MODE (mode))
-	    continue;
-
-	  struct df_link *link = DF_REF_CHAIN (use);
-
-	  /* We know by now that these are swaps, so we can delete
-	     them confidently.  */
-	  while (link)
-	    {
-	      rtx def_insn = DF_REF_INSN (link->ref);
-	      insn_entry[INSN_UID (def_insn)].will_delete = 1;
-	      link = link->next;
-	    }
-	}
-    }
-}
-
-/* OP is either a CONST_VECTOR or an expression containing one.
-   Swap the first half of the vector with the second in the first
-   case.  Recurse to find it in the second.  */
-static void
-swap_const_vector_halves (rtx op)
-{
-  int i;
-  enum rtx_code code = GET_CODE (op);
-  if (GET_CODE (op) == CONST_VECTOR)
-    {
-      int half_units = GET_MODE_NUNITS (GET_MODE (op)) / 2;
-      for (i = 0; i < half_units; ++i)
-	{
-	  rtx temp = CONST_VECTOR_ELT (op, i);
-	  CONST_VECTOR_ELT (op, i) = CONST_VECTOR_ELT (op, i + half_units);
-	  CONST_VECTOR_ELT (op, i + half_units) = temp;
-	}
-    }
-  else
-    {
-      int j;
-      const char *fmt = GET_RTX_FORMAT (code);
-      for (i = 0; i < GET_RTX_LENGTH (code); ++i)
-	if (fmt[i] == 'e' || fmt[i] == 'u')
-	  swap_const_vector_halves (XEXP (op, i));
-	else if (fmt[i] == 'E')
-	  for (j = 0; j < XVECLEN (op, i); ++j)
-	    swap_const_vector_halves (XVECEXP (op, i, j));
-    }
-}
-
-/* Find all subregs of a vector expression that perform a narrowing,
-   and adjust the subreg index to account for doubleword swapping.  */
-static void
-adjust_subreg_index (rtx op)
-{
-  enum rtx_code code = GET_CODE (op);
-  if (code == SUBREG
-      && (GET_MODE_SIZE (GET_MODE (op))
-	  < GET_MODE_SIZE (GET_MODE (XEXP (op, 0)))))
-    {
-      unsigned int index = SUBREG_BYTE (op);
-      if (index < 8)
-	index += 8;
-      else
-	index -= 8;
-      SUBREG_BYTE (op) = index;
-    }
-
-  const char *fmt = GET_RTX_FORMAT (code);
-  int i,j;
-  for (i = 0; i < GET_RTX_LENGTH (code); ++i)
-    if (fmt[i] == 'e' || fmt[i] == 'u')
-      adjust_subreg_index (XEXP (op, i));
-    else if (fmt[i] == 'E')
-      for (j = 0; j < XVECLEN (op, i); ++j)
-	adjust_subreg_index (XVECEXP (op, i, j));
-}
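Editorial aside, not part of the patch: the index adjustment above in
isolation.  For a narrowing subreg of a 16-byte vector, swapping
doublewords means the data formerly at byte offset B now lives at
offset B ^ 8:

  /* Byte offset of a narrowing subreg after the doubleword swap.  */
  static unsigned int
  swapped_subreg_byte (unsigned int byte)
  {
    return byte < 8 ? byte + 8 : byte - 8;
  }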
-
-/* Convert the non-permuting load INSN to a permuting one.  */
-static void
-permute_load (rtx_insn *insn)
-{
-  rtx body = PATTERN (insn);
-  rtx mem_op = SET_SRC (body);
-  rtx tgt_reg = SET_DEST (body);
-  machine_mode mode = GET_MODE (tgt_reg);
-  int n_elts = GET_MODE_NUNITS (mode);
-  int half_elts = n_elts / 2;
-  rtx par = gen_rtx_PARALLEL (mode, rtvec_alloc (n_elts));
-  int i, j;
-  for (i = 0, j = half_elts; i < half_elts; ++i, ++j)
-    XVECEXP (par, 0, i) = GEN_INT (j);
-  for (i = half_elts, j = 0; j < half_elts; ++i, ++j)
-    XVECEXP (par, 0, i) = GEN_INT (j);
-  rtx sel = gen_rtx_VEC_SELECT (mode, mem_op, par);
-  SET_SRC (body) = sel;
-  INSN_CODE (insn) = -1; /* Force re-recognition.  */
-  df_insn_rescan (insn);
-
-  if (dump_file)
-    fprintf (dump_file, "Replacing load %d with permuted load\n",
-	     INSN_UID (insn));
-}
-
-/* Convert the non-permuting store INSN to a permuting one.  */
-static void
-permute_store (rtx_insn *insn)
-{
-  rtx body = PATTERN (insn);
-  rtx src_reg = SET_SRC (body);
-  machine_mode mode = GET_MODE (src_reg);
-  int n_elts = GET_MODE_NUNITS (mode);
-  int half_elts = n_elts / 2;
-  rtx par = gen_rtx_PARALLEL (mode, rtvec_alloc (n_elts));
-  int i, j;
-  for (i = 0, j = half_elts; i < half_elts; ++i, ++j)
-    XVECEXP (par, 0, i) = GEN_INT (j);
-  for (i = half_elts, j = 0; j < half_elts; ++i, ++j)
-    XVECEXP (par, 0, i) = GEN_INT (j);
-  rtx sel = gen_rtx_VEC_SELECT (mode, src_reg, par);
-  SET_SRC (body) = sel;
-  INSN_CODE (insn) = -1; /* Force re-recognition.  */
-  df_insn_rescan (insn);
-
-  if (dump_file)
-    fprintf (dump_file, "Replacing store %d with permuted store\n",
-	     INSN_UID (insn));
-}
-
-/* Given INSN that contains a vector extract operation, adjust the index
-   of the extracted lane to account for the doubleword swap.  */
-static void
-adjust_extract (rtx_insn *insn)
-{
-  rtx pattern = PATTERN (insn);
-  if (GET_CODE (pattern) == PARALLEL)
-    pattern = XVECEXP (pattern, 0, 0);
-  rtx src = SET_SRC (pattern);
-  /* The vec_select may be wrapped in a vec_duplicate for a splat, so
-     account for that.  */
-  rtx sel = GET_CODE (src) == VEC_DUPLICATE ? XEXP (src, 0) : src;
-  rtx par = XEXP (sel, 1);
-  int half_elts = GET_MODE_NUNITS (GET_MODE (XEXP (sel, 0))) >> 1;
-  int lane = INTVAL (XVECEXP (par, 0, 0));
-  lane = lane >= half_elts ? lane - half_elts : lane + half_elts;
-  XVECEXP (par, 0, 0) = GEN_INT (lane);
-  INSN_CODE (insn) = -1; /* Force re-recognition.  */
-  df_insn_rescan (insn);
-
-  if (dump_file)
-    fprintf (dump_file, "Changing lane for extract %d\n", INSN_UID (insn));
-}
-
-/* Given INSN that contains a vector direct-splat operation, adjust the
-   index of the source lane to account for the doubleword swap.  */
-static void
-adjust_splat (rtx_insn *insn)
-{
-  rtx body = PATTERN (insn);
-  rtx unspec = XEXP (body, 1);
-  int half_elts = GET_MODE_NUNITS (GET_MODE (unspec)) >> 1;
-  int lane = INTVAL (XVECEXP (unspec, 0, 1));
-  lane = lane >= half_elts ? lane - half_elts : lane + half_elts;
-  XVECEXP (unspec, 0, 1) = GEN_INT (lane);
-  INSN_CODE (insn) = -1; /* Force re-recognition.  */
-  df_insn_rescan (insn);
-
-  if (dump_file)
-    fprintf (dump_file, "Changing lane for splat %d\n", INSN_UID (insn));
-}
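Editorial aside, not part of the patch: adjust_extract and adjust_splat
share the same lane arithmetic, which can be factored as:

  /* Map LANE into the opposite doubleword half of a vector with
     N_ELTS elements, as done by adjust_extract and adjust_splat.  */
  static int
  swapped_lane (int lane, int n_elts)
  {
    int half = n_elts / 2;
    return lane >= half ? lane - half : lane + half;
  }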
-
-/* Given INSN that contains an XXPERMDI operation (one that is not
-   itself a doubleword swap), reverse the order of the source operands
-   and adjust the indices of the source lanes to account for doubleword
-   reversal.  */
-static void
-adjust_xxpermdi (rtx_insn *insn)
-{
-  rtx set = PATTERN (insn);
-  rtx select = XEXP (set, 1);
-  rtx concat = XEXP (select, 0);
-  rtx src0 = XEXP (concat, 0);
-  XEXP (concat, 0) = XEXP (concat, 1);
-  XEXP (concat, 1) = src0;
-  rtx parallel = XEXP (select, 1);
-  int lane0 = INTVAL (XVECEXP (parallel, 0, 0));
-  int lane1 = INTVAL (XVECEXP (parallel, 0, 1));
-  int new_lane0 = 3 - lane1;
-  int new_lane1 = 3 - lane0;
-  XVECEXP (parallel, 0, 0) = GEN_INT (new_lane0);
-  XVECEXP (parallel, 0, 1) = GEN_INT (new_lane1);
-  INSN_CODE (insn) = -1; /* Force re-recognition.  */
-  df_insn_rescan (insn);
-
-  if (dump_file)
-    fprintf (dump_file, "Changing lanes for xxpermdi %d\n", INSN_UID (insn));
-}
-
-/* Given INSN that contains a VEC_CONCAT operation of two doublewords,
-   reverse the order of those inputs.  */
-static void
-adjust_concat (rtx_insn *insn)
-{
-  rtx set = PATTERN (insn);
-  rtx concat = XEXP (set, 1);
-  rtx src0 = XEXP (concat, 0);
-  XEXP (concat, 0) = XEXP (concat, 1);
-  XEXP (concat, 1) = src0;
-  INSN_CODE (insn) = -1; /* Force re-recognition.  */
-  df_insn_rescan (insn);
-
-  if (dump_file)
-    fprintf (dump_file, "Reversing inputs for concat %d\n", INSN_UID (insn));
-}
-
-/* Given an UNSPEC_VPERM insn, modify the mask loaded from the
-   constant pool to reflect swapped doublewords.  */
-static void
-adjust_vperm (rtx_insn *insn)
-{
-  /* We previously determined that the UNSPEC_VPERM was fed by a
-     swap of a swapping load of a TOC-relative constant pool symbol.
-     Find the MEM in the swapping load and replace it with a MEM for
-     the adjusted mask constant.  */
-  rtx set = PATTERN (insn);
-  rtx mask_reg = XVECEXP (SET_SRC (set), 0, 2);
-
-  /* Find the swap.  */
-  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
-  df_ref use;
-  rtx_insn *swap_insn = 0;
-  FOR_EACH_INSN_INFO_USE (use, insn_info)
-    if (rtx_equal_p (DF_REF_REG (use), mask_reg))
-      {
-	struct df_link *def_link = DF_REF_CHAIN (use);
-	gcc_assert (def_link && !def_link->next);
-	swap_insn = DF_REF_INSN (def_link->ref);
-	break;
-      }
-  gcc_assert (swap_insn);
-  
-  /* Find the load.  */
-  insn_info = DF_INSN_INFO_GET (swap_insn);
-  rtx_insn *load_insn = 0;
-  FOR_EACH_INSN_INFO_USE (use, insn_info)
-    {
-      struct df_link *def_link = DF_REF_CHAIN (use);
-      gcc_assert (def_link && !def_link->next);
-      load_insn = DF_REF_INSN (def_link->ref);
-      break;
-    }
-  gcc_assert (load_insn);
-
-  /* Find the TOC-relative symbol access.  */
-  insn_info = DF_INSN_INFO_GET (load_insn);
-  rtx_insn *tocrel_insn = 0;
-  FOR_EACH_INSN_INFO_USE (use, insn_info)
-    {
-      struct df_link *def_link = DF_REF_CHAIN (use);
-      gcc_assert (def_link && !def_link->next);
-      tocrel_insn = DF_REF_INSN (def_link->ref);
-      break;
-    }
-  gcc_assert (tocrel_insn);
-
-  /* Find the embedded CONST_VECTOR.  We have to call toc_relative_expr_p
-     to set tocrel_base; otherwise it would be unnecessary as we've
-     already established it will return true.  */
-  rtx base, offset;
-  const_rtx tocrel_base;
-  rtx tocrel_expr = SET_SRC (PATTERN (tocrel_insn));
-  /* There is an extra level of indirection for small/large code models.  */
-  if (GET_CODE (tocrel_expr) == MEM)
-    tocrel_expr = XEXP (tocrel_expr, 0);
-  if (!toc_relative_expr_p (tocrel_expr, false, &tocrel_base, NULL))
-    gcc_unreachable ();
-  split_const (XVECEXP (tocrel_base, 0, 0), &base, &offset);
-  rtx const_vector = get_pool_constant (base);
-  /* With the extra indirection, get_pool_constant will produce the
-     real constant from the reg_equal expression, so get the real
-     constant.  */
-  if (GET_CODE (const_vector) == SYMBOL_REF)
-    const_vector = get_pool_constant (const_vector);
-  gcc_assert (GET_CODE (const_vector) == CONST_VECTOR);
-
-  /* Create an adjusted mask from the initial mask.  */
-  unsigned int new_mask[16], i, val;
-  for (i = 0; i < 16; ++i)
-    {
-      val = INTVAL (XVECEXP (const_vector, 0, i));
-      if (val < 16)
-	new_mask[i] = (val + 8) % 16;
-      else
-	new_mask[i] = ((val + 8) % 16) + 16;
-    }
-
-  /* Create a new CONST_VECTOR and a MEM that references it.  */
-  rtx vals = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16));
-  for (i = 0; i < 16; ++i)
-    XVECEXP (vals, 0, i) = GEN_INT (new_mask[i]);
-  rtx new_const_vector = gen_rtx_CONST_VECTOR (V16QImode, XVEC (vals, 0));
-  rtx new_mem = force_const_mem (V16QImode, new_const_vector);
-  /* This gives us a MEM whose base operand is a SYMBOL_REF, which we
-     can't recognize.  Force the SYMBOL_REF into a register.  */
-  if (!REG_P (XEXP (new_mem, 0)))
-    {
-      rtx base_reg = force_reg (Pmode, XEXP (new_mem, 0));
-      XEXP (new_mem, 0) = base_reg;
-      /* Move the newly created insn ahead of the load insn.  */
-      rtx_insn *force_insn = get_last_insn ();
-      remove_insn (force_insn);
-      rtx_insn *before_load_insn = PREV_INSN (load_insn);
-      add_insn_after (force_insn, before_load_insn,
-		      BLOCK_FOR_INSN (load_insn));
-      df_insn_rescan (before_load_insn);
-      df_insn_rescan (force_insn);
-    }
-
-  /* Replace the MEM in the load instruction and rescan it.  */
-  XEXP (SET_SRC (PATTERN (load_insn)), 0) = new_mem;
-  INSN_CODE (load_insn) = -1; /* Force re-recognition.  */
-  df_insn_rescan (load_insn);
-
-  if (dump_file)
-    fprintf (dump_file, "Adjusting mask for vperm %d\n", INSN_UID (insn));
-}
-
-/* The insn described by INSN_ENTRY[I] can be swapped, but only
-   with special handling.  Take care of that here.  */
-static void
-handle_special_swappables (swap_web_entry *insn_entry, unsigned i)
-{
-  rtx_insn *insn = insn_entry[i].insn;
-  rtx body = PATTERN (insn);
-
-  switch (insn_entry[i].special_handling)
-    {
-    default:
-      gcc_unreachable ();
-    case SH_CONST_VECTOR:
-      {
-	/* A CONST_VECTOR will only show up somewhere in the RHS of a SET.  */
-	gcc_assert (GET_CODE (body) == SET);
-	rtx rhs = SET_SRC (body);
-	swap_const_vector_halves (rhs);
-	if (dump_file)
-	  fprintf (dump_file, "Swapping constant halves in insn %d\n", i);
-	break;
-      }
-    case SH_SUBREG:
-      /* A subreg of the same size is already safe.  For subregs that
-	 select a smaller portion of a reg, adjust the index for
-	 swapped doublewords.  */
-      adjust_subreg_index (body);
-      if (dump_file)
-	fprintf (dump_file, "Adjusting subreg in insn %d\n", i);
-      break;
-    case SH_NOSWAP_LD:
-      /* Convert a non-permuting load to a permuting one.  */
-      permute_load (insn);
-      break;
-    case SH_NOSWAP_ST:
-      /* Convert a non-permuting store to a permuting one.  */
-      permute_store (insn);
-      break;
-    case SH_EXTRACT:
-      /* Change the lane on an extract operation.  */
-      adjust_extract (insn);
-      break;
-    case SH_SPLAT:
-      /* Change the lane on a direct-splat operation.  */
-      adjust_splat (insn);
-      break;
-    case SH_XXPERMDI:
-      /* Change the lanes on an XXPERMDI operation.  */
-      adjust_xxpermdi (insn);
-      break;
-    case SH_CONCAT:
-      /* Reverse the order of a concatenation operation.  */
-      adjust_concat (insn);
-      break;
-    case SH_VPERM:
-      /* Change the mask loaded from the constant pool for a VPERM.  */
-      adjust_vperm (insn);
-      break;
-    }
-}
-
-/* Find the insn from the Ith table entry, which is known to be a
-   register swap Y = SWAP(X).  Replace it with a copy Y = X.  */
-static void
-replace_swap_with_copy (swap_web_entry *insn_entry, unsigned i)
-{
-  rtx_insn *insn = insn_entry[i].insn;
-  rtx body = PATTERN (insn);
-  rtx src_reg = XEXP (SET_SRC (body), 0);
-  rtx copy = gen_rtx_SET (SET_DEST (body), src_reg);
-  rtx_insn *new_insn = emit_insn_before (copy, insn);
-  set_block_for_insn (new_insn, BLOCK_FOR_INSN (insn));
-  df_insn_rescan (new_insn);
-
-  if (dump_file)
-    {
-      unsigned int new_uid = INSN_UID (new_insn);
-      fprintf (dump_file, "Replacing swap %d with copy %d\n", i, new_uid);
-    }
-
-  df_insn_delete (insn);
-  remove_insn (insn);
-  insn->set_deleted ();
-}
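Editorial aside, not part of the patch: in RTL terms, the replacement
above turns

  (set (reg:V2DI y) (vec_select:V2DI (reg:V2DI x)
                      (parallel [(const_int 1) (const_int 0)])))

into

  (set (reg:V2DI y) (reg:V2DI x))

and, as noted in the pass overview, later forward propagation usually
eliminates the copy as well.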
-
-/* Dump the swap table to DUMP_FILE.  */
-static void
-dump_swap_insn_table (swap_web_entry *insn_entry)
-{
-  int e = get_max_uid ();
-  fprintf (dump_file, "\nRelevant insns with their flag settings\n\n");
-
-  for (int i = 0; i < e; ++i)
-    if (insn_entry[i].is_relevant)
-      {
-	swap_web_entry *pred_entry = (swap_web_entry *)insn_entry[i].pred ();
-	fprintf (dump_file, "%6d %6d  ", i,
-		 pred_entry && pred_entry->insn
-		 ? INSN_UID (pred_entry->insn) : 0);
-	if (insn_entry[i].is_load)
-	  fputs ("load ", dump_file);
-	if (insn_entry[i].is_store)
-	  fputs ("store ", dump_file);
-	if (insn_entry[i].is_swap)
-	  fputs ("swap ", dump_file);
-	if (insn_entry[i].is_live_in)
-	  fputs ("live-in ", dump_file);
-	if (insn_entry[i].is_live_out)
-	  fputs ("live-out ", dump_file);
-	if (insn_entry[i].contains_subreg)
-	  fputs ("subreg ", dump_file);
-	if (insn_entry[i].is_128_int)
-	  fputs ("int128 ", dump_file);
-	if (insn_entry[i].is_call)
-	  fputs ("call ", dump_file);
-	if (insn_entry[i].is_swappable)
-	  {
-	    fputs ("swappable ", dump_file);
-	    if (insn_entry[i].special_handling == SH_CONST_VECTOR)
-	      fputs ("special:constvec ", dump_file);
-	    else if (insn_entry[i].special_handling == SH_SUBREG)
-	      fputs ("special:subreg ", dump_file);
-	    else if (insn_entry[i].special_handling == SH_NOSWAP_LD)
-	      fputs ("special:load ", dump_file);
-	    else if (insn_entry[i].special_handling == SH_NOSWAP_ST)
-	      fputs ("special:store ", dump_file);
-	    else if (insn_entry[i].special_handling == SH_EXTRACT)
-	      fputs ("special:extract ", dump_file);
-	    else if (insn_entry[i].special_handling == SH_SPLAT)
-	      fputs ("special:splat ", dump_file);
-	    else if (insn_entry[i].special_handling == SH_XXPERMDI)
-	      fputs ("special:xxpermdi ", dump_file);
-	    else if (insn_entry[i].special_handling == SH_CONCAT)
-	      fputs ("special:concat ", dump_file);
-	    else if (insn_entry[i].special_handling == SH_VPERM)
-	      fputs ("special:vperm ", dump_file);
-	  }
-	if (insn_entry[i].web_not_optimizable)
-	  fputs ("unoptimizable ", dump_file);
-	if (insn_entry[i].will_delete)
-	  fputs ("delete ", dump_file);
-	fputs ("\n", dump_file);
-      }
-  fputs ("\n", dump_file);
-}
-
-/* Return ALIGN with its address canonicalized to (reg) or (plus reg reg).
-   Here ALIGN is an (and addr (const_int -16)).  Always return a new copy
-   to avoid problems with combine.  */
-static rtx
-alignment_with_canonical_addr (rtx align)
-{
-  rtx canon;
-  rtx addr = XEXP (align, 0);
-
-  if (REG_P (addr))
-    canon = addr;
-
-  else if (GET_CODE (addr) == PLUS)
-    {
-      rtx addrop0 = XEXP (addr, 0);
-      rtx addrop1 = XEXP (addr, 1);
-
-      if (!REG_P (addrop0))
-	addrop0 = force_reg (GET_MODE (addrop0), addrop0);
-
-      if (!REG_P (addrop1))
-	addrop1 = force_reg (GET_MODE (addrop1), addrop1);
-
-      canon = gen_rtx_PLUS (GET_MODE (addr), addrop0, addrop1);
-    }
-
-  else
-    canon = force_reg (GET_MODE (addr), addr);
-
-  return gen_rtx_AND (GET_MODE (align), canon, GEN_INT (-16));
-}
-
-/* Check whether INSN computes a 16-byte alignment mask, and if so,
-   return a fully-expanded rtx for the masking operation.  */
-static rtx
-alignment_mask (rtx_insn *insn)
-{
-  rtx body = PATTERN (insn);
-
-  if (GET_CODE (body) != SET
-      || GET_CODE (SET_SRC (body)) != AND
-      || !REG_P (XEXP (SET_SRC (body), 0)))
-    return 0;
-
-  rtx mask = XEXP (SET_SRC (body), 1);
-
-  if (GET_CODE (mask) == CONST_INT)
-    {
-      if (INTVAL (mask) == -16)
-	return alignment_with_canonical_addr (SET_SRC (body));
-      else
-	return 0;
-    }
-
-  if (!REG_P (mask))
-    return 0;
-
-  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
-  df_ref use;
-  rtx real_mask = 0;
-
-  FOR_EACH_INSN_INFO_USE (use, insn_info)
-    {
-      if (!rtx_equal_p (DF_REF_REG (use), mask))
-	continue;
-
-      struct df_link *def_link = DF_REF_CHAIN (use);
-      if (!def_link || def_link->next)
-	return 0;
-
-      rtx_insn *const_insn = DF_REF_INSN (def_link->ref);
-      rtx const_body = PATTERN (const_insn);
-      if (GET_CODE (const_body) != SET)
-	return 0;
-
-      real_mask = SET_SRC (const_body);
-
-      if (GET_CODE (real_mask) != CONST_INT
-	  || INTVAL (real_mask) != -16)
-	return 0;
-    }
-
-  if (real_mask == 0)
-    return 0;
-
-  return alignment_with_canonical_addr (SET_SRC (body));
-}
-
-/* Given INSN that's a load or store based at BASE_REG, look for a
-   feeding computation that aligns its address on a 16-byte boundary.  */
-static rtx
-find_alignment_op (rtx_insn *insn, rtx base_reg)
-{
-  df_ref base_use;
-  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
-  rtx and_operation = 0;
-
-  FOR_EACH_INSN_INFO_USE (base_use, insn_info)
-    {
-      if (!rtx_equal_p (DF_REF_REG (base_use), base_reg))
-	continue;
-
-      struct df_link *base_def_link = DF_REF_CHAIN (base_use);
-      if (!base_def_link || base_def_link->next)
-	break;
-
-      /* With stack-protector code enabled, and possibly in other
-	 circumstances, there may not be an associated insn for 
-	 the def.  */
-      if (DF_REF_IS_ARTIFICIAL (base_def_link->ref))
-	break;
-
-      rtx_insn *and_insn = DF_REF_INSN (base_def_link->ref);
-      and_operation = alignment_mask (and_insn);
-      if (and_operation != 0)
-	break;
-    }
-
-  return and_operation;
-}
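Editorial aside, not part of the patch: the (and ... -16) these two
routines search for comes from lvx/stvx semantics, which ignore the
low four address bits.  A builtin use that produces such RTL, assuming
<altivec.h> and -maltivec:

  #include <altivec.h>

  /* vec_ld rounds the effective address down to a 16-byte boundary,
     so its RTL address has the form (and (reg) (const_int -16)).  */
  vector int
  load_aligned (int *p)
  {
    return vec_ld (0, p);
  }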
-
-struct del_info { bool replace; rtx_insn *replace_insn; };
-
-/* If INSN is the load for an lvx pattern, put it in canonical form.  */
-static void
-recombine_lvx_pattern (rtx_insn *insn, del_info *to_delete)
-{
-  rtx body = PATTERN (insn);
-  gcc_assert (GET_CODE (body) == SET
-	      && GET_CODE (SET_SRC (body)) == VEC_SELECT
-	      && GET_CODE (XEXP (SET_SRC (body), 0)) == MEM);
-
-  rtx mem = XEXP (SET_SRC (body), 0);
-  rtx base_reg = XEXP (mem, 0);
-
-  rtx and_operation = find_alignment_op (insn, base_reg);
-
-  if (and_operation != 0)
-    {
-      df_ref def;
-      struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
-      FOR_EACH_INSN_INFO_DEF (def, insn_info)
-	{
-	  struct df_link *link = DF_REF_CHAIN (def);
-	  if (!link || link->next)
-	    break;
-
-	  rtx_insn *swap_insn = DF_REF_INSN (link->ref);
-	  if (!insn_is_swap_p (swap_insn)
-	      || insn_is_load_p (swap_insn)
-	      || insn_is_store_p (swap_insn))
-	    break;
-
-	  /* Expected lvx pattern found.  Change the swap to
-	     a copy, and propagate the AND operation into the
-	     load.  */
-	  to_delete[INSN_UID (swap_insn)].replace = true;
-	  to_delete[INSN_UID (swap_insn)].replace_insn = swap_insn;
-
-	  XEXP (mem, 0) = and_operation;
-	  SET_SRC (body) = mem;
-	  INSN_CODE (insn) = -1; /* Force re-recognition.  */
-	  df_insn_rescan (insn);
-		  
-	  if (dump_file)
-	    fprintf (dump_file, "lvx opportunity found at %d\n",
-		     INSN_UID (insn));
-	}
-    }
-}
-
-/* If INSN is the store for an stvx pattern, put it in canonical form.  */
-static void
-recombine_stvx_pattern (rtx_insn *insn, del_info *to_delete)
-{
-  rtx body = PATTERN (insn);
-  gcc_assert (GET_CODE (body) == SET
-	      && GET_CODE (SET_DEST (body)) == MEM
-	      && GET_CODE (SET_SRC (body)) == VEC_SELECT);
-  rtx mem = SET_DEST (body);
-  rtx base_reg = XEXP (mem, 0);
-
-  rtx and_operation = find_alignment_op (insn, base_reg);
-
-  if (and_operation != 0)
-    {
-      rtx src_reg = XEXP (SET_SRC (body), 0);
-      df_ref src_use;
-      struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
-      FOR_EACH_INSN_INFO_USE (src_use, insn_info)
-	{
-	  if (!rtx_equal_p (DF_REF_REG (src_use), src_reg))
-	    continue;
-
-	  struct df_link *link = DF_REF_CHAIN (src_use);
-	  if (!link || link->next)
-	    break;
-
-	  rtx_insn *swap_insn = DF_REF_INSN (link->ref);
-	  if (!insn_is_swap_p (swap_insn)
-	      || insn_is_load_p (swap_insn)
-	      || insn_is_store_p (swap_insn))
-	    break;
-
-	  /* Expected stvx pattern found.  Change the swap to
-	     a copy, and propagate the AND operation into the
-	     store.  */
-	  to_delete[INSN_UID (swap_insn)].replace = true;
-	  to_delete[INSN_UID (swap_insn)].replace_insn = swap_insn;
-
-	  XEXP (mem, 0) = and_operation;
-	  SET_SRC (body) = src_reg;
-	  INSN_CODE (insn) = -1; /* Force re-recognition.  */
-	  df_insn_rescan (insn);
-		  
-	  if (dump_file)
-	    fprintf (dump_file, "stvx opportunity found at %d\n",
-		     INSN_UID (insn));
-	}
-    }
-}
-
-/* Look for patterns created from builtin lvx and stvx calls, and
-   canonicalize them to be properly recognized as such.  */
-static void
-recombine_lvx_stvx_patterns (function *fun)
-{
-  int i;
-  basic_block bb;
-  rtx_insn *insn;
-
-  int num_insns = get_max_uid ();
-  del_info *to_delete = XCNEWVEC (del_info, num_insns);
-
-  FOR_ALL_BB_FN (bb, fun)
-    FOR_BB_INSNS (bb, insn)
-    {
-      if (!NONDEBUG_INSN_P (insn))
-	continue;
-
-      if (insn_is_load_p (insn) && insn_is_swap_p (insn))
-	recombine_lvx_pattern (insn, to_delete);
-      else if (insn_is_store_p (insn) && insn_is_swap_p (insn))
-	recombine_stvx_pattern (insn, to_delete);
-    }
-
-  /* Turning swaps into copies is delayed until now, to avoid problems
-     with deleting instructions during the insn walk.  */
-  for (i = 0; i < num_insns; i++)
-    if (to_delete[i].replace)
-      {
-	rtx swap_body = PATTERN (to_delete[i].replace_insn);
-	rtx src_reg = XEXP (SET_SRC (swap_body), 0);
-	rtx copy = gen_rtx_SET (SET_DEST (swap_body), src_reg);
-	rtx_insn *new_insn = emit_insn_before (copy,
-					       to_delete[i].replace_insn);
-	set_block_for_insn (new_insn,
-			    BLOCK_FOR_INSN (to_delete[i].replace_insn));
-	df_insn_rescan (new_insn);
-	df_insn_delete (to_delete[i].replace_insn);
-	remove_insn (to_delete[i].replace_insn);
-	to_delete[i].replace_insn->set_deleted ();
-      }
-
-  free (to_delete);
-}
-
-/* Main entry point for this pass.  */
-unsigned int
-rs6000_analyze_swaps (function *fun)
-{
-  swap_web_entry *insn_entry;
-  basic_block bb;
-  rtx_insn *insn, *curr_insn = 0;
-
-  /* Dataflow analysis for use-def chains.  */
-  df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
-  df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
-  df_analyze ();
-  df_set_flags (DF_DEFER_INSN_RESCAN);
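-  /* With rescans deferred, the df_insn_rescan calls made while
-     rewriting insns below are queued rather than processed
-     immediately.  */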
-
-  /* Pre-pass to recombine lvx and stvx patterns so we don't lose info.  */
-  recombine_lvx_stvx_patterns (fun);
-
-  /* Allocate structure to represent webs of insns.  */
-  insn_entry = XCNEWVEC (swap_web_entry, get_max_uid ());
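-  /* INSN_ENTRY is indexed by INSN_UID.  Each entry starts as its own
-     union-find root; union_defs and union_uses merge insns connected
-     by def-use chains into webs.  */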
-
-  /* Walk the insns to gather basic data.  */
-  FOR_ALL_BB_FN (bb, fun)
-    FOR_BB_INSNS_SAFE (bb, insn, curr_insn)
-    {
-      unsigned int uid = INSN_UID (insn);
-      if (NONDEBUG_INSN_P (insn))
-	{
-	  insn_entry[uid].insn = insn;
-
-	  if (GET_CODE (insn) == CALL_INSN)
-	    insn_entry[uid].is_call = 1;
-
-	  /* Walk the uses and defs to see if we mention vector regs.
-	     Record any constraints on optimization of such mentions.  */
-	  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
-	  df_ref mention;
-	  FOR_EACH_INSN_INFO_USE (mention, insn_info)
-	    {
-	      /* We use DF_REF_REAL_REG here to get inside any subregs.  */
-	      machine_mode mode = GET_MODE (DF_REF_REAL_REG (mention));
-
-	      /* If a use gets its value from a call insn, it will be
-		 a hard register and will look like (reg:V4SI 3 3).
-		 The df analysis creates two mentions for GPR3 and GPR4,
-		 both DImode.  We must recognize this and treat it as a
-		 vector mention to ensure the call is unioned with this
-		 use.  */
-	      if (mode == DImode && DF_REF_INSN_INFO (mention))
-		{
-		  rtx feeder = DF_REF_INSN (mention);
-		  /* FIXME:  It is pretty hard to get from the df mention
-		     to the mode of the use in the insn.  We arbitrarily
-		     pick a vector mode here, even though the use might
-		     be a real DImode.  We can be too conservative
-		     (create a web larger than necessary) because of
-		     this, so consider eventually fixing this.  */
-		  if (GET_CODE (feeder) == CALL_INSN)
-		    mode = V4SImode;
-		}
-
-	      if (ALTIVEC_OR_VSX_VECTOR_MODE (mode) || mode == TImode)
-		{
-		  insn_entry[uid].is_relevant = 1;
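-		  /* 128-bit integers and scalar floating-point values
-		     also occupy vector registers; webs mentioning them
-		     are marked unoptimizable below.  */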
-		  if (mode == TImode || mode == V1TImode
-		      || FLOAT128_VECTOR_P (mode))
-		    insn_entry[uid].is_128_int = 1;
-		  if (DF_REF_INSN_INFO (mention))
-		    insn_entry[uid].contains_subreg
-		      = !rtx_equal_p (DF_REF_REG (mention),
-				      DF_REF_REAL_REG (mention));
-		  union_defs (insn_entry, insn, mention);
-		}
-	    }
-	  FOR_EACH_INSN_INFO_DEF (mention, insn_info)
-	    {
-	      /* We use DF_REF_REAL_REG here to get inside any subregs.  */
-	      machine_mode mode = GET_MODE (DF_REF_REAL_REG (mention));
-
-	      /* If we're loading up a hard vector register for a call,
-		 it looks like (set (reg:V4SI 9 9) (...)).  The df
-		 analysis creates two mentions for GPR9 and GPR10, both
-		 DImode.  So relying on the mode from the mentions
-		 isn't sufficient to ensure we union the call into the
-		 web with the parameter setup code.  */
-	      if (mode == DImode && GET_CODE (PATTERN (insn)) == SET
-		  && ALTIVEC_OR_VSX_VECTOR_MODE
-		       (GET_MODE (SET_DEST (PATTERN (insn)))))
-		mode = GET_MODE (SET_DEST (PATTERN (insn)));
-
-	      if (ALTIVEC_OR_VSX_VECTOR_MODE (mode) || mode == TImode)
-		{
-		  insn_entry[uid].is_relevant = 1;
-		  if (mode == TImode || mode == V1TImode
-		      || FLOAT128_VECTOR_P (mode))
-		    insn_entry[uid].is_128_int = 1;
-		  if (DF_REF_INSN_INFO (mention))
-		    insn_entry[uid].contains_subreg
-		      = !rtx_equal_p (DF_REF_REG (mention),
-				      DF_REF_REAL_REG (mention));
-		  /* REG_FUNCTION_VALUE_P is not valid for subregs.  */
-		  else if (REG_FUNCTION_VALUE_P (DF_REF_REG (mention)))
-		    insn_entry[uid].is_live_out = 1;
-		  union_uses (insn_entry, insn, mention);
-		}
-	    }
-
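-	  /* Insns that never mentioned a vector reg keep is_relevant
-	     clear and are ignored from here on.  */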
-	  if (insn_entry[uid].is_relevant)
-	    {
-	      /* Determine if this is a load or store.  */
-	      insn_entry[uid].is_load = insn_is_load_p (insn);
-	      insn_entry[uid].is_store = insn_is_store_p (insn);
-
-	      /* Determine if this is a doubleword swap.  If not,
-		 determine whether it can legally be swapped.  */
-	      if (insn_is_swap_p (insn))
-		insn_entry[uid].is_swap = 1;
-	      else
-		{
-		  unsigned int special = SH_NONE;
-		  insn_entry[uid].is_swappable
-		    = insn_is_swappable_p (insn_entry, insn, &special);
-		  if (special != SH_NONE && insn_entry[uid].contains_subreg)
-		    insn_entry[uid].is_swappable = 0;
-		  else if (special != SH_NONE)
-		    insn_entry[uid].special_handling = special;
-		  else if (insn_entry[uid].contains_subreg)
-		    insn_entry[uid].special_handling = SH_SUBREG;
-		}
-	    }
-	}
-    }
-
-  if (dump_file)
-    {
-      fprintf (dump_file, "\nSwap insn entry table when first built\n");
-      dump_swap_insn_table (insn_entry);
-    }
-
-  /* Record unoptimizable webs.  */
-  unsigned e = get_max_uid (), i;
-  for (i = 0; i < e; ++i)
-    {
-      if (!insn_entry[i].is_relevant)
-	continue;
-
-      swap_web_entry *root
-	= (swap_web_entry*)(&insn_entry[i])->unionfind_root ();
-
-      if (insn_entry[i].is_live_in || insn_entry[i].is_live_out
-	  || (insn_entry[i].contains_subreg
-	      && insn_entry[i].special_handling != SH_SUBREG)
-	  || insn_entry[i].is_128_int || insn_entry[i].is_call
-	  || !(insn_entry[i].is_swappable || insn_entry[i].is_swap))
-	root->web_not_optimizable = 1;
-
-      /* If we have loads or stores that aren't permuting, then the
-	 optimization isn't appropriate.  */
-      else if ((insn_entry[i].is_load || insn_entry[i].is_store)
-	  && !insn_entry[i].is_swap && !insn_entry[i].is_swappable)
-	root->web_not_optimizable = 1;
-
-      /* If we have permuting loads or stores that are not accompanied
-	 by a register swap, the optimization isn't appropriate.  */
-      else if (insn_entry[i].is_load && insn_entry[i].is_swap)
-	{
-	  rtx insn = insn_entry[i].insn;
-	  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
-	  df_ref def;
-
-	  FOR_EACH_INSN_INFO_DEF (def, insn_info)
-	    {
-	      struct df_link *link = DF_REF_CHAIN (def);
-
-	      if (!chain_contains_only_swaps (insn_entry, link, FOR_LOADS))
-		{
-		  root->web_not_optimizable = 1;
-		  break;
-		}
-	    }
-	}
-      else if (insn_entry[i].is_store && insn_entry[i].is_swap)
-	{
-	  rtx insn = insn_entry[i].insn;
-	  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
-	  df_ref use;
-
-	  FOR_EACH_INSN_INFO_USE (use, insn_info)
-	    {
-	      struct df_link *link = DF_REF_CHAIN (use);
-
-	      if (!chain_contains_only_swaps (insn_entry, link, FOR_STORES))
-		{
-		  root->web_not_optimizable = 1;
-		  break;
-		}
-	    }
-	}
-    }
-
-  if (dump_file)
-    {
-      fprintf (dump_file, "\nSwap insn entry table after web analysis\n");
-      dump_swap_insn_table (insn_entry);
-    }
-
-  /* For each load and store in an optimizable web (which implies
-     the loads and stores are permuting), find the associated
-     register swaps and mark them for removal.  Due to various
-     optimizations we may mark the same swap more than once.  Also
-     perform special handling for swappable insns that require it.  */
-  for (i = 0; i < e; ++i)
-    if ((insn_entry[i].is_load || insn_entry[i].is_store)
-	&& insn_entry[i].is_swap)
-      {
-	swap_web_entry* root_entry
-	  = (swap_web_entry*)((&insn_entry[i])->unionfind_root ());
-	if (!root_entry->web_not_optimizable)
-	  mark_swaps_for_removal (insn_entry, i);
-      }
-    else if (insn_entry[i].is_swappable && insn_entry[i].special_handling)
-      {
-	swap_web_entry* root_entry
-	  = (swap_web_entry*)((&insn_entry[i])->unionfind_root ());
-	if (!root_entry->web_not_optimizable)
-	  handle_special_swappables (insn_entry, i);
-      }
-
-  /* Now delete the swaps marked for removal.  */
-  for (i = 0; i < e; ++i)
-    if (insn_entry[i].will_delete)
-      replace_swap_with_copy (insn_entry, i);
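-  /* Each such swap becomes a plain register copy, which later rtl
-     passes can usually propagate away.  */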
-
-  /* Clean up.  */
-  free (insn_entry);
-  return 0;
-}
-
-const pass_data pass_data_analyze_swaps =
-{
-  RTL_PASS, /* type */
-  "swaps", /* name */
-  OPTGROUP_NONE, /* optinfo_flags */
-  TV_NONE, /* tv_id */
-  0, /* properties_required */
-  0, /* properties_provided */
-  0, /* properties_destroyed */
-  0, /* todo_flags_start */
-  TODO_df_finish, /* todo_flags_finish */
-};
-
-class pass_analyze_swaps : public rtl_opt_pass
-{
-public:
-  pass_analyze_swaps (gcc::context *ctxt)
-    : rtl_opt_pass (pass_data_analyze_swaps, ctxt)
-  {}
-
-  /* opt_pass methods: */
-  virtual bool gate (function *)
-    {
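-      /* Swaps are generated only for little-endian VSX code; power9
-	 (ISA 3.0) has lxvx/stxvx, which need no swaps, and
-	 -mno-optimize-swaps disables the pass entirely.  */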
-      return (optimize > 0 && !BYTES_BIG_ENDIAN && TARGET_VSX
-	      && !TARGET_P9_VECTOR && rs6000_optimize_swaps);
-    }
-
-  virtual unsigned int execute (function *fun)
-    {
-      return rs6000_analyze_swaps (fun);
-    }
-
-  opt_pass *clone ()
-    {
-      return new pass_analyze_swaps (m_ctxt);
-    }
-
-}; // class pass_analyze_swaps
-
-rtl_opt_pass *
-make_pass_analyze_swaps (gcc::context *ctxt)
-{
-  return new pass_analyze_swaps (ctxt);
-}
-
 #ifdef RS6000_GLIBC_ATOMIC_FENV
 /* Function declarations for rs6000_atomic_assign_expand_fenv.  */
 static tree atomic_hold_decl, atomic_clear_decl, atomic_update_decl;
Index: gcc/config/rs6000/t-rs6000
===================================================================
--- gcc/config/rs6000/t-rs6000	(revision 250005)
+++ gcc/config/rs6000/t-rs6000	(working copy)
@@ -30,6 +30,10 @@ 
 	$(COMPILE) $<
 	$(POSTCOMPILE)
 
+rs6000-p8swap.o: $(srcdir)/config/rs6000/rs6000-p8swap.c
+	$(COMPILE) $<
+	$(POSTCOMPILE)
+
 $(srcdir)/config/rs6000/rs6000-tables.opt: $(srcdir)/config/rs6000/genopt.sh \
   $(srcdir)/config/rs6000/rs6000-cpus.def
 	$(SHELL) $(srcdir)/config/rs6000/genopt.sh $(srcdir)/config/rs6000 > \
Index: gcc/config.gcc
===================================================================
--- gcc/config.gcc	(revision 250005)
+++ gcc/config.gcc	(working copy)
@@ -454,7 +454,7 @@ 
 	;;
 powerpc*-*-*)
 	cpu_type=rs6000
-	extra_objs="rs6000-string.o"
+	extra_objs="rs6000-string.o rs6000-p8swap.o"
 	extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
 	extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h x86intrin.h"
 	extra_headers="${extra_headers} ppu_intrinsics.h spu2vmx.h vec_types.h si2vmx.h"
@@ -472,7 +472,7 @@ 
 	;;
 rs6000*-*-*)
 	extra_options="${extra_options} g.opt fused-madd.opt rs6000/rs6000-tables.opt"
-	extra_objs="rs6000-string.o"
+	extra_objs="rs6000-string.o rs6000-p8swap.o"
 	;;
 sparc*-*-*)
 	cpu_type=sparc