[02/10] mips: Implement vec_perm_const.

Message ID 1324486822-18225-3-git-send-email-rth@redhat.com
State New

Commit Message

Richard Henderson Dec. 21, 2011, 5 p.m. UTC
---
 gcc/config/mips/loongson.h     |    4 +-
 gcc/config/mips/loongson.md    |  200 ++++++++++++++++++++++++++---------
 gcc/config/mips/mips-modes.def |   12 ++-
 gcc/config/mips/mips-protos.h  |    1 +
 gcc/config/mips/mips-ps-3d.md  |  225 +++++++++++++++++++++++++++++----------
 gcc/config/mips/mips.c         |  228 ++++++++++++++++++++++++++++++++++++++--
 gcc/config/mips/predicates.md  |   11 ++-
 7 files changed, 557 insertions(+), 124 deletions(-)

Comments

Richard Sandiford Dec. 22, 2011, 7:33 p.m. UTC | #1
Looks good, but with this:

Richard Henderson <rth@redhat.com> writes:
> +(define_insn "*vec_setv4hi"
> +  [(set (match_operand:V4HI 0 "register_operand" "=f")
> +	(unspec:V4HI [(match_operand:V4HI 1 "register_operand" "f")
> +		      (match_operand:SI 2 "register_operand" "f")
> +		      (match_operand:SI 3 "const_0_to_3_operand" "")]
> +		     UNSPEC_LOONGSON_PINSRH))]
>    "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
> -  "pinsr<V_suffix>_3\t%0,%1,%2"
> +  "pinsrh_%3\t%0,%1,%2"
>    [(set_attr "type" "fdiv")])
>  
> +(define_expand "vec_setv4hi"
> +  [(set (match_operand:V4HI 0 "register_operand" "=f")
> +	(unspec:V4HI [(match_operand:V4HI 1 "register_operand" "f")
> +		      (match_operand:HI 2 "register_operand" "f")
> +		      (match_operand:SI 3 "const_0_to_3_operand" "")]
> +		     UNSPEC_LOONGSON_PINSRH))]
> +  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
> +{
> +  rtx ext = gen_reg_rtx (SImode);
> +  emit_move_insn (ext, gen_lowpart (SImode, operands[1]));
> +  operands[1] = ext;
> +})

was it simply the mode punning from HI to V4HI that stops us from
using the loongson_pinsr<V_suffix>_N insns?  E.g. something like:

(define_expand "vec_setv4hi"
  [(set (match_operand:V4HI 0 "register_operand" "=f")
	(unspec:V4HI [(match_operand:V4HI 1 "register_operand" "f")
		      (match_operand:HI 2 "register_operand" "f")
		      (match_operand:SI 3 "const_0_to_3_operand" "")]
		     UNSPEC_LOONGSON_PINSRH))]
  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
{
  unsigned char perm[4];
  int i;

  for (i = 0; i < 4; i++)
    perm[i] = (i == INTVAL (operands[3]) ? 4 : i);
  operands[2] = copy_to_reg (gen_lowpart (V4HImode, operands[2]));
  if (!mips_expand_vselect_vconcat (operands[0], operands[1], operands[2],
				    perm, 4))
    gcc_unreachable ();
  DONE;
})

I realise this isn't exactly simpler in code terms, but it avoids the
unspec and avoids the dual mode on match_operand 2 (HImode for matching,
SImode for generation).  And with HImode not being valid for FPRs,
I'd have expected:

    (subreg:V4HI (reg:HI R) 0)

to be as "bad" as:

    (subreg:SI (reg:HI R) 0)

> diff --git a/gcc/config/mips/mips-modes.def b/gcc/config/mips/mips-modes.def
> index b9c508b..85861a9 100644
> --- a/gcc/config/mips/mips-modes.def
> +++ b/gcc/config/mips/mips-modes.def
> @@ -26,9 +26,15 @@ RESET_FLOAT_FORMAT (DF, mips_double_format);
>  FLOAT_MODE (TF, 16, mips_quad_format);
>  
>  /* Vector modes.  */
> -VECTOR_MODES (INT, 8);        /*       V8QI V4HI V2SI */
> -VECTOR_MODES (FLOAT, 8);      /*            V4HF V2SF */
> -VECTOR_MODES (INT, 4);        /*            V4QI V2HI */
> +VECTOR_MODES (INT, 8);        /*       V8QI  V4HI V2SI */
> +VECTOR_MODES (FLOAT, 8);      /*             V4HF V2SF */
> +VECTOR_MODES (INT, 4);        /*             V4QI V2HI */

Not sure about this bit.

> +/* Double-sized vector modes for vec_concat.  */
> +VECTOR_MODE (INT, QI, 16);
> +VECTOR_MODE (INT, HI, 8);
> +VECTOR_MODE (INT, SI, 4);
> +VECTOR_MODE (FLOAT, SF, 4);

It occurred to me later that there might be an ABI impact with this
on n32 and n64, due to the historical mistake of defining things
in terms of modes (partly for the sake of libcalls):

      /* Scalar, complex and vector floating-point types are passed in
	 floating-point registers, as long as this is a named rather
	 than a variable argument.  */
      info->fpr_p = (named
		     && (type == 0 || FLOAT_TYPE_P (type))
		     && (GET_MODE_CLASS (mode) == MODE_FLOAT
			 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
			 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
		     && GET_MODE_UNIT_SIZE (mode) <= UNITS_PER_FPVALUE);

UNITS_PER_FPVALUE is 16 because of the 128-bit long double.  I think every:

    GET_MODE_CLASS (...) == MODE_VECTOR_FLOAT

in mips.c needs to be changed to "mode == V2SFmode".
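
(For concreteness, a sketch of the suggested substitution applied to the
fragment quoted above; the same one-line change would be repeated at each
MODE_VECTOR_FLOAT test in mips.c, and this is illustrative only:)

      /* Sketch: pass only V2SF vectors in FPRs, so that the new
	 V4SFmode added for vec_concat cannot affect n32/n64
	 argument passing.  */
      info->fpr_p = (named
		     && (type == 0 || FLOAT_TYPE_P (type))
		     && (GET_MODE_CLASS (mode) == MODE_FLOAT
			 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
			 || mode == V2SFmode)
		     && GET_MODE_UNIT_SIZE (mode) <= UNITS_PER_FPVALUE);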

> +/* Construct (set target (vec_select op0 (parallel perm))) and
> +   return true if that's a valid instruction in the active ISA.  */
> +
> +static bool
> +expand_vselect (rtx target, rtx op0, const unsigned char *perm, unsigned nelt)

Stupid nitpick, but there's a mixture of statics without "mips_" and
statics with.  Would be nice to have them all with "mips_", just in case.

> +/* Construct (set target (vec_select op0 (parallel perm))) and
> +   return true if that's a valid instruction in the active ISA.  */
> +
> +static bool
> +expand_vselect (rtx target, rtx op0, const unsigned char *perm, unsigned nelt)
> +{
> +  rtx rperm[MAX_VECT_LEN], x;
> +  unsigned i;
> +
> +  for (i = 0; i < nelt; ++i)
> +    rperm[i] = GEN_INT (perm[i]);
> +
> +  x = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelt, rperm));
> +  x = gen_rtx_VEC_SELECT (GET_MODE (target), op0, x);
> +  x = gen_rtx_SET (VOIDmode, target, x);
> +
> +  x = emit_insn (x);
> +  if (recog_memoized (x) < 0)
> +    {
> +      remove_insn (x);
> +      return false;
> +    }
> +  return true;
> +}

Is there an explicit target-independent requirement that
TARGET_VECTORIZE_VEC_PERM_CONST_OK is only called in a
context where insns can be emitted?

Reload seems to get by with make_insn_raw.  Maybe we could add
a "testing?" parameter and only call add_insn if the parameter
is false, and if recog_memoized succeeds.
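
(A rough sketch of that variant, assuming make_insn_raw and add_insn are
usable here just as they are in reload; untested:)

static bool
mips_expand_vselect (rtx target, rtx op0, const unsigned char *perm,
		     unsigned nelt, bool testing_p)
{
  rtx rperm[MAX_VECT_LEN], x, insn;
  unsigned i;

  for (i = 0; i < nelt; ++i)
    rperm[i] = GEN_INT (perm[i]);

  x = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelt, rperm));
  x = gen_rtx_VEC_SELECT (GET_MODE (target), op0, x);
  x = gen_rtx_SET (VOIDmode, target, x);

  /* Build the insn without adding it to the instruction stream.  */
  insn = make_insn_raw (x);
  if (recog_memoized (insn) < 0)
    return false;

  /* Only emit the insn when expanding for real.  */
  if (!testing_p)
    add_insn (insn);
  return true;
}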

> +      /* The backend (vec_select (vec_concat)) patterns are not duplicated
> +	 for single-operand.  Try once with the original un-folded selector. */
> +      if (mips_expand_vec_perm_const_1 (&d))
> +	return true;

What sort of input does this handle, and why don't we want the same thing
for case 1 & 2 single vectors?

Richard
Richard Henderson Dec. 22, 2011, 8:45 p.m. UTC | #2
On 12/22/2011 11:33 AM, Richard Sandiford wrote:
> was it simply the mode punning from HI to V4HI that stops us from
> using the loongson_pinsr<V_suffix>_N insns?  E.g. something like:

Mostly...

> I realise this isn't exactly simpler in code terms, but it avoids the
> unspec and avoids the dual mode on match_operand 2 (HImode for matching,
> SImode for generation).  And with HImode not being valid for FPRs,
> I'd have expected:
> 
>     (subreg:V4HI (reg:HI R) 0)
> 
> to be as "bad" as:
> 
>     (subreg:SI (reg:HI R) 0)

Well, for one thing, note that I actually copy that subreg into a real
SImode pseudo.  So that sub-reggy-ness is fairly well forced to happen
in the integer registers.  At which point the SImode value is perfectly
happy to be copied into the fp regs.

The only other alternative that I see is to do the same with DImode
and then subreg from there to V4HImode.  While I only tested mips64el,
I would guess that using DImode would perform worse in 32-bit mode.


>>  /* Vector modes.  */
>> -VECTOR_MODES (INT, 8);        /*       V8QI V4HI V2SI */
>> -VECTOR_MODES (FLOAT, 8);      /*            V4HF V2SF */
>> -VECTOR_MODES (INT, 4);        /*            V4QI V2HI */
>> +VECTOR_MODES (INT, 8);        /*       V8QI  V4HI V2SI */
>> +VECTOR_MODES (FLOAT, 8);      /*             V4HF V2SF */
>> +VECTOR_MODES (INT, 4);        /*             V4QI V2HI */
> 
> Not sure about this bit.

That's just re-aligning the comment columns so that V16QI fits.

>> +/* Double-sized vector modes for vec_concat.  */
>> +VECTOR_MODE (INT, QI, 16);
>> +VECTOR_MODE (INT, HI, 8);
>> +VECTOR_MODE (INT, SI, 4);
>> +VECTOR_MODE (FLOAT, SF, 4);
> 
> It occurred to me later that there might be an ABI impact with this
> on n32 and n64, due to the historical mistake of defining things
> in terms of modes (partly for the sake of libcalls):

Hmm.  Except that since mips_vector_mode_supported_p disallows
V4SFmode, you'll never wind up with variables of that type.

If the mips folk ever come up with another isa extension that
operates on larger register sets, this will make a difference...

>> +/* Construct (set target (vec_select op0 (parallel perm))) and
>> +   return true if that's a valid instruction in the active ISA.  */
>> +
>> +static bool
>> +expand_vselect (rtx target, rtx op0, const unsigned char *perm, unsigned nelt)
> 
> Stupid nitpick, but there's a mixture of statics without "mips_" and
> statics with.  Would be nice to have them all with "mips_", just in case.

Ok.

> Is there an explicit target-independent requirement that
> TARGET_VECTORIZE_VEC_PERM_CONST_OK is only called in a
> context where insns can be emitted?

We start a sequence inside vec_perm_const_ok that prevents emit_insn
from actually doing anything.  It works everywhere else...

>> +      /* The backend (vec_select (vec_concat)) patterns are not duplicated
>> +	 for single-operand.  Try once with the original un-folded selector. */
>> +      if (mips_expand_vec_perm_const_1 (&d))
>> +	return true;
> 
> What sort of input does this handle, and why don't we want the same thing
> for case 1 & 2 single vectors?

  V4HI a;
  __builtin_select (a, a, (V4HI){ 0, 4, 1, 5 })

When we called vec_perm_const_ok, we looked at the elements of the selector
and saw that they come from two different operands.  So we performed the test
with op0 != op1 and one_operand_p = false.

Here in expand_vec_perm_const, we see that a == a and want to fold to  { 0, 0, 1, 1 }.
Well, this caused us problems in the i386 port where we had some double-operand
patterns that didn't match a single-operand pattern.

... of course on the i386 port we try the simplified version first.  Which really
makes more sense than this.  Not sure what I was thinking...


r~
Richard Sandiford Dec. 22, 2011, 9:16 p.m. UTC | #3
Richard Henderson <rth@redhat.com> writes:
> On 12/22/2011 11:33 AM, Richard Sandiford wrote:
>> was it simply the mode punning from HI to V4HI that stops us from
>> using the loongson_pinsr<V_suffix>_N insns?  E.g. something like:
>
> Mostly...
>
>> I realise this isn't exactly simpler in code terms, but it avoids the
>> unspec and avoids the dual mode on match_operand 2 (HImode for matching,
>> SImode for generation).  And with HImode not being valid for FPRs,
>> I'd have expected:
>> 
>>     (subreg:V4HI (reg:HI R) 0)
>> 
>> to be as "bad" as:
>> 
>>     (subreg:SI (reg:HI R) 0)
>
> Well, for one thing, note that I actually copy that subreg into a real
> SImode pseudo.  So that sub-reggy-ness is fairly well forced to happen
> in the integer registers.  At which point the SImode value is perfectly
> happy to be copied into the fp regs.

You mean the register allocator will prefer a GPR for the SImode pseudo,
so the need for one won't be hidden in an input reload of the subreg?
Not sure -- maybe it would get a GPR, but with one definition as "d"
and one use as "f", I'm not sure we can really rely on that.

> The only other alternative that I see is to do the same with DImode
> and then subreg from there to V4HImode.  While I only tested mips64el,
> I would guess that using DImode would perform worse in 32-bit mode.

Well, I wasn't thinking of using DImode, more of going directly from
HImode to V4HImode, but I suppose the same thing would apply for
the subreg reload.  Never mind then.

>>>  /* Vector modes.  */
>>> -VECTOR_MODES (INT, 8);        /*       V8QI V4HI V2SI */
>>> -VECTOR_MODES (FLOAT, 8);      /*            V4HF V2SF */
>>> -VECTOR_MODES (INT, 4);        /*            V4QI V2HI */
>>> +VECTOR_MODES (INT, 8);        /*       V8QI  V4HI V2SI */
>>> +VECTOR_MODES (FLOAT, 8);      /*             V4HF V2SF */
>>> +VECTOR_MODES (INT, 4);        /*             V4QI V2HI */
>> 
>> Not sure about this bit.
>
> That's just re-aligning the comment columns so that V16QI fits.

OK, but there wasn't a V16QI comment that I could see.

>>> +/* Double-sized vector modes for vec_concat.  */
>>> +VECTOR_MODE (INT, QI, 16);
>>> +VECTOR_MODE (INT, HI, 8);
>>> +VECTOR_MODE (INT, SI, 4);
>>> +VECTOR_MODE (FLOAT, SF, 4);
>> 
>> It occurred to me later that there might be an ABI impact with this
>> on n32 and n64, due to the historical mistake of defining things
>> in terms of modes (partly for the sake of libcalls):
>
> Hmm.  Except that since mips_vector_mode_supported_p disallows
> V4SFmode, you'll never wind up with variables of that type.
>
> If the mips folk ever come up with another isa extension that
> operates on larger register sets, this will make a difference...

I'd still rather be safe than sorry when it comes to ABI stuff.
We've defined the mode, and we've long allowed users to define:

    typedef float v4sf __attribute__((vector_size(16)));

regardless of backend support.  I realise we shouldn't be matching
the two up in the absence of bugs, but...

>> Is there an explicit target-independent requirement that
>> TARGET_VECTORIZE_VEC_PERM_CONST_OK is only called in a
>> context where insns can be emitted?
>
> We start a sequence inside vec_perm_const_ok that prevents emit_insn
> from actually doing anything.  It works everywhere else...

OK.

>>> +      /* The backend (vec_select (vec_concat)) patterns are not duplicated
>>> +	 for single-operand.  Try once with the original un-folded selector. */
>>> +      if (mips_expand_vec_perm_const_1 (&d))
>>> +	return true;
>> 
>> What sort of input does this handle, and why don't we want the same thing
>> for case 1 & 2 single vectors?
>
>   V4HI a;
>   __builtin_select (a, a, (V4HI){ 0, 4, 1, 5 })
>
> When we called vec_perm_const_ok, we looked at the elements of the selector
> and saw that they come from two different operands.  So we performed the test
> with op0 != op1 and one_operand_p = false.
>
> Here in expand_vec_perm_const, we see that a == a and want to fold to
> { 0, 0, 1, 1 }.  Well, this caused us problems in the i386 port where
> we had some double-operand patterns that didn't match a single-operand
> pattern.

OK, but just so that I understand: does that mean we're missing some
single-operand cases that ought to be there, and this code is just
stopping us from ICEing when we hit them?  Would it be worth having
a gcc_checking_assert there?

Richard
Richard Henderson Dec. 22, 2011, 9:49 p.m. UTC | #4
On 12/22/2011 01:16 PM, Richard Sandiford wrote:
>> That's just re-aligning the comment columns so that V16QI fits.
> 
> OK, but there wasn't a V16QI comment that I could see.

Heh.  It was supposed to be on ...

>>>> +/* Double-sized vector modes for vec_concat.  */
>>>> +VECTOR_MODE (INT, QI, 16);
>>>> +VECTOR_MODE (INT, HI, 8);
>>>> +VECTOR_MODE (INT, SI, 4);
>>>> +VECTOR_MODE (FLOAT, SF, 4);

... these.  I wonder where those got lost.

> I'd still rather be safe than sorry when it comes to ABI stuff.

Ok.

> OK, but just so that I understand: does that mean we're missing some
> single-operand cases that ought to be there, and this code is just
> stopping us from ICEing when we hit them?  Would it be worth having
> a gcc_checking_assert there?

I don't know if we're missing any cases.  We may not be; the patterns
we're using here are certainly simpler than those on x86.

I can certainly add an assert...
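
(Purely for illustration, one possible placement is at the tail of
mips_expand_vec_perm_const, with a new "bool ok" local, on the assumption
that anything vec_perm_const_ok accepted must also expand:)

  ok = mips_expand_vec_perm_const_1 (&d);

  /* Sketch: the hook and the expander are supposed to agree; a failure
     here would suggest a missing (vec_select (vec_concat)) pattern.  */
  gcc_checking_assert (ok);
  return ok;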


r~

Patch

diff --git a/gcc/config/mips/loongson.h b/gcc/config/mips/loongson.h
index 6bfd4d7..fcaf553 100644
--- a/gcc/config/mips/loongson.h
+++ b/gcc/config/mips/loongson.h
@@ -449,13 +449,13 @@  psadbh (uint8x8_t s, uint8x8_t t)
 __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
 pshufh_u (uint16x4_t dest, uint16x4_t s, uint8_t order)
 {
-  return __builtin_loongson_pshufh_u (dest, s, order);
+  return __builtin_loongson_pshufh_u (s, order);
 }
 
 __extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
 pshufh_s (int16x4_t dest, int16x4_t s, uint8_t order)
 {
-  return __builtin_loongson_pshufh_s (dest, s, order);
+  return __builtin_loongson_pshufh_s (s, order);
 }
 
 /* Shift left logical.  */
diff --git a/gcc/config/mips/loongson.md b/gcc/config/mips/loongson.md
index 225f4d1..1b1fe0b 100644
--- a/gcc/config/mips/loongson.md
+++ b/gcc/config/mips/loongson.md
@@ -24,10 +24,7 @@ 
   UNSPEC_LOONGSON_PCMPEQ
   UNSPEC_LOONGSON_PCMPGT
   UNSPEC_LOONGSON_PEXTR
-  UNSPEC_LOONGSON_PINSR_0
-  UNSPEC_LOONGSON_PINSR_1
-  UNSPEC_LOONGSON_PINSR_2
-  UNSPEC_LOONGSON_PINSR_3
+  UNSPEC_LOONGSON_PINSRH
   UNSPEC_LOONGSON_PMADD
   UNSPEC_LOONGSON_PMOVMSK
   UNSPEC_LOONGSON_PMULHU
@@ -231,52 +228,87 @@ 
   [(set_attr "type" "fadd")])
 
 ;; Extract halfword.
-(define_insn "loongson_pextr<V_suffix>"
-  [(set (match_operand:VH 0 "register_operand" "=f")
-        (unspec:VH [(match_operand:VH 1 "register_operand" "f")
- 		    (match_operand:SI 2 "register_operand" "f")]
+(define_insn "loongson_pextrh"
+  [(set (match_operand:V4HI 0 "register_operand" "=f")
+        (unspec:V4HI [(match_operand:V4HI 1 "register_operand" "f")
+		      (match_operand:SI 2 "register_operand" "f")]
 		   UNSPEC_LOONGSON_PEXTR))]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-  "pextr<V_suffix>\t%0,%1,%2"
+  "pextrh\t%0,%1,%2"
   [(set_attr "type" "fmul")])
 
 ;; Insert halfword.
-(define_insn "loongson_pinsr<V_suffix>_0"
-  [(set (match_operand:VH 0 "register_operand" "=f")
-        (unspec:VH [(match_operand:VH 1 "register_operand" "f")
-		    (match_operand:VH 2 "register_operand" "f")]
-		   UNSPEC_LOONGSON_PINSR_0))]
+(define_insn "loongson_pinsrh_0"
+  [(set (match_operand:V4HI 0 "register_operand" "=f")
+	(vec_select:V4HI
+	  (vec_concat:V8HI
+	    (match_operand:V4HI 1 "register_operand" "f")
+	    (match_operand:V4HI 2 "register_operand" "f"))
+	  (parallel [(const_int 4) (const_int 1)
+		     (const_int 2) (const_int 3)])))]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+  "pinsrh_0\t%0,%1,%2"
+  [(set_attr "type" "fdiv")])
+
+(define_insn "loongson_pinsrh_1"
+  [(set (match_operand:V4HI 0 "register_operand" "=f")
+	(vec_select:V4HI
+	  (vec_concat:V8HI
+	    (match_operand:V4HI 1 "register_operand" "f")
+	    (match_operand:V4HI 2 "register_operand" "f"))
+	  (parallel [(const_int 0) (const_int 4)
+		     (const_int 2) (const_int 3)])))]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-  "pinsr<V_suffix>_0\t%0,%1,%2"
+  "pinsrh_1\t%0,%1,%2"
   [(set_attr "type" "fdiv")])
 
-(define_insn "loongson_pinsr<V_suffix>_1"
-  [(set (match_operand:VH 0 "register_operand" "=f")
-        (unspec:VH [(match_operand:VH 1 "register_operand" "f")
-		    (match_operand:VH 2 "register_operand" "f")]
-		   UNSPEC_LOONGSON_PINSR_1))]
+(define_insn "loongson_pinsrh_2"
+  [(set (match_operand:V4HI 0 "register_operand" "=f")
+	(vec_select:V4HI
+	  (vec_concat:V8HI
+	    (match_operand:V4HI 1 "register_operand" "f")
+	    (match_operand:V4HI 2 "register_operand" "f"))
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 4) (const_int 3)])))]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-  "pinsr<V_suffix>_1\t%0,%1,%2"
+  "pinsrh_2\t%0,%1,%2"
   [(set_attr "type" "fdiv")])
 
-(define_insn "loongson_pinsr<V_suffix>_2"
-  [(set (match_operand:VH 0 "register_operand" "=f")
-        (unspec:VH [(match_operand:VH 1 "register_operand" "f")
-		    (match_operand:VH 2 "register_operand" "f")]
-		   UNSPEC_LOONGSON_PINSR_2))]
+(define_insn "loongson_pinsrh_3"
+  [(set (match_operand:V4HI 0 "register_operand" "=f")
+	(vec_select:V4HI
+	  (vec_concat:V8HI
+	    (match_operand:V4HI 1 "register_operand" "f")
+	    (match_operand:V4HI 2 "register_operand" "f"))
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 4)])))]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-  "pinsr<V_suffix>_2\t%0,%1,%2"
+  "pinsrh_3\t%0,%1,%2"
   [(set_attr "type" "fdiv")])
 
-(define_insn "loongson_pinsr<V_suffix>_3"
-  [(set (match_operand:VH 0 "register_operand" "=f")
-        (unspec:VH [(match_operand:VH 1 "register_operand" "f")
-		    (match_operand:VH 2 "register_operand" "f")]
-		   UNSPEC_LOONGSON_PINSR_3))]
+(define_insn "*vec_setv4hi"
+  [(set (match_operand:V4HI 0 "register_operand" "=f")
+	(unspec:V4HI [(match_operand:V4HI 1 "register_operand" "f")
+		      (match_operand:SI 2 "register_operand" "f")
+		      (match_operand:SI 3 "const_0_to_3_operand" "")]
+		     UNSPEC_LOONGSON_PINSRH))]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-  "pinsr<V_suffix>_3\t%0,%1,%2"
+  "pinsrh_%3\t%0,%1,%2"
   [(set_attr "type" "fdiv")])
 
+(define_expand "vec_setv4hi"
+  [(set (match_operand:V4HI 0 "register_operand" "=f")
+	(unspec:V4HI [(match_operand:V4HI 1 "register_operand" "f")
+		      (match_operand:HI 2 "register_operand" "f")
+		      (match_operand:SI 3 "const_0_to_3_operand" "")]
+		     UNSPEC_LOONGSON_PINSRH))]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+{
+  rtx ext = gen_reg_rtx (SImode);
+  emit_move_insn (ext, gen_lowpart (SImode, operands[1]));
+  operands[1] = ext;
+})
+
 ;; Multiply and add packed integers.
 (define_insn "loongson_pmadd<V_stretch_half_suffix>"
   [(set (match_operand:<V_stretch_half> 0 "register_operand" "=f")
@@ -403,12 +435,11 @@ 
 ;; Shuffle halfwords.
 (define_insn "loongson_pshufh"
   [(set (match_operand:VH 0 "register_operand" "=f")
-        (unspec:VH [(match_operand:VH 1 "register_operand" "0")
-		    (match_operand:VH 2 "register_operand" "f")
-		    (match_operand:SI 3 "register_operand" "f")]
+        (unspec:VH [(match_operand:VH 1 "register_operand" "f")
+		    (match_operand:SI 2 "register_operand" "f")]
 		   UNSPEC_LOONGSON_PSHUFH))]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-  "pshufh\t%0,%2,%3"
+  "pshufh\t%0,%1,%2"
   [(set_attr "type" "fmul")])
 
 ;; Shift left logical.
@@ -478,26 +509,95 @@ 
   "psubus<V_suffix>\t%0,%1,%2"
   [(set_attr "type" "fadd")])
 
-;; Unpack high data.
-(define_insn "vec_interleave_high<mode>"
-  [(set (match_operand:VWHB 0 "register_operand" "=f")
-        (unspec:VWHB [(match_operand:VWHB 1 "register_operand" "f")
-		      (match_operand:VWHB 2 "register_operand" "f")]
-		     UNSPEC_LOONGSON_PUNPCKH))]
+;; Unpack high data.  Recall that Loongson only runs in little-endian.
+(define_insn "loongson_punpckhbh"
+  [(set (match_operand:V8QI 0 "register_operand" "=f")
+	(vec_select:V8QI
+	  (vec_concat:V16QI
+	    (match_operand:V8QI 1 "register_operand" "f")
+	    (match_operand:V8QI 2 "register_operand" "f"))
+	  (parallel [(const_int 4) (const_int 12)
+		     (const_int 5) (const_int 13)
+		     (const_int 6) (const_int 14)
+		     (const_int 7) (const_int 15)])))]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+  "punpckhbh\t%0,%1,%2"
+  [(set_attr "type" "fdiv")])
+
+(define_insn "loongson_punpckhhw"
+  [(set (match_operand:V4HI 0 "register_operand" "=f")
+	(vec_select:V4HI
+	  (vec_concat:V8HI
+	    (match_operand:V4HI 1 "register_operand" "f")
+	    (match_operand:V4HI 2 "register_operand" "f"))
+	  (parallel [(const_int 2) (const_int 6)
+		     (const_int 3) (const_int 7)])))]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-  "punpckh<V_stretch_half_suffix>\t%0,%1,%2"
+  "punpckhhw\t%0,%1,%2"
+  [(set_attr "type" "fdiv")])
+
+(define_insn "loongson_punpckhwd"
+  [(set (match_operand:V2SI 0 "register_operand" "=f")
+	(vec_select:V2SI
+	  (vec_concat:V4SI
+	    (match_operand:V2SI 1 "register_operand" "f")
+	    (match_operand:V2SI 2 "register_operand" "f"))
+	  (parallel [(const_int 1) (const_int 3)])))]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+  "punpckhwd\t%0,%1,%2"
   [(set_attr "type" "fdiv")])
 
 ;; Unpack low data.
-(define_insn "vec_interleave_low<mode>"
-  [(set (match_operand:VWHB 0 "register_operand" "=f")
-        (unspec:VWHB [(match_operand:VWHB 1 "register_operand" "f")
-		      (match_operand:VWHB 2 "register_operand" "f")]
-		     UNSPEC_LOONGSON_PUNPCKL))]
+(define_insn "loongson_punpcklbh"
+  [(set (match_operand:V8QI 0 "register_operand" "=f")
+	(vec_select:V8QI
+	  (vec_concat:V16QI
+	    (match_operand:V8QI 1 "register_operand" "f")
+	    (match_operand:V8QI 2 "register_operand" "f"))
+	  (parallel [(const_int 0) (const_int 8)
+		     (const_int 1) (const_int 9)
+		     (const_int 2) (const_int 10)
+		     (const_int 3) (const_int 11)])))]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+  "punpcklbh\t%0,%1,%2"
+  [(set_attr "type" "fdiv")])
+
+(define_insn "loongson_punpcklhw"
+  [(set (match_operand:V4HI 0 "register_operand" "=f")
+	(vec_select:V4HI
+	  (vec_concat:V8HI
+	    (match_operand:V4HI 1 "register_operand" "f")
+	    (match_operand:V4HI 2 "register_operand" "f"))
+	  (parallel [(const_int 0) (const_int 4)
+		     (const_int 1) (const_int 5)])))]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+  "punpcklhw\t%0,%1,%2"
+  [(set_attr "type" "fdiv")])
+
+(define_insn "loongson_punpcklwd"
+  [(set (match_operand:V2SI 0 "register_operand" "=f")
+	(vec_select:V2SI
+	  (vec_concat:V4SI
+	    (match_operand:V2SI 1 "register_operand" "f")
+	    (match_operand:V2SI 2 "register_operand" "f"))
+	  (parallel [(const_int 0) (const_int 2)])))]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-  "punpckl<V_stretch_half_suffix>\t%0,%1,%2"
+  "punpcklwd\t%0,%1,%2"
   [(set_attr "type" "fdiv")])
 
+(define_expand "vec_perm_const<mode>"
+  [(match_operand:VWHB 0 "register_operand" "")
+   (match_operand:VWHB 1 "register_operand" "")
+   (match_operand:VWHB 2 "register_operand" "")
+   (match_operand:VWHB 3 "" "")]
+  "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
+{
+  if (mips_expand_vec_perm_const (operands))
+    DONE;
+  else
+    FAIL;
+})
+
 ;; Integer division and modulus.  For integer multiplication, see mips.md.
 
 (define_insn "<u>div<mode>3"
diff --git a/gcc/config/mips/mips-modes.def b/gcc/config/mips/mips-modes.def
index b9c508b..85861a9 100644
--- a/gcc/config/mips/mips-modes.def
+++ b/gcc/config/mips/mips-modes.def
@@ -26,9 +26,15 @@  RESET_FLOAT_FORMAT (DF, mips_double_format);
 FLOAT_MODE (TF, 16, mips_quad_format);
 
 /* Vector modes.  */
-VECTOR_MODES (INT, 8);        /*       V8QI V4HI V2SI */
-VECTOR_MODES (FLOAT, 8);      /*            V4HF V2SF */
-VECTOR_MODES (INT, 4);        /*            V4QI V2HI */
+VECTOR_MODES (INT, 8);        /*       V8QI  V4HI V2SI */
+VECTOR_MODES (FLOAT, 8);      /*             V4HF V2SF */
+VECTOR_MODES (INT, 4);        /*             V4QI V2HI */
+
+/* Double-sized vector modes for vec_concat.  */
+VECTOR_MODE (INT, QI, 16);
+VECTOR_MODE (INT, HI, 8);
+VECTOR_MODE (INT, SI, 4);
+VECTOR_MODE (FLOAT, SF, 4);
 
 VECTOR_MODES (FRACT, 4);	/* V4QQ  V2HQ */
 VECTOR_MODES (UFRACT, 4);	/* V4UQQ V2UHQ */
diff --git a/gcc/config/mips/mips-protos.h b/gcc/config/mips/mips-protos.h
index dbabdff..37c958d 100644
--- a/gcc/config/mips/mips-protos.h
+++ b/gcc/config/mips/mips-protos.h
@@ -328,6 +328,7 @@  extern void mips_expand_atomic_qihi (union mips_gen_fn_ptrs,
 				     rtx, rtx, rtx, rtx);
 
 extern void mips_expand_vector_init (rtx, rtx);
+extern bool mips_expand_vec_perm_const (rtx op[4]);
 
 extern bool mips_eh_uses (unsigned int);
 extern bool mips_epilogue_uses (unsigned int);
diff --git a/gcc/config/mips/mips-ps-3d.md b/gcc/config/mips/mips-ps-3d.md
index 504f43c..fbbb7b0 100644
--- a/gcc/config/mips/mips-ps-3d.md
+++ b/gcc/config/mips/mips-ps-3d.md
@@ -89,62 +89,170 @@ 
   DONE;
 })
 
-; pul.ps - Pair Upper Lower
-(define_insn "mips_pul_ps"
+(define_insn "vec_perm_const_ps"
   [(set (match_operand:V2SF 0 "register_operand" "=f")
-	(vec_merge:V2SF
-	 (match_operand:V2SF 1 "register_operand" "f")
-	 (match_operand:V2SF 2 "register_operand" "f")
-	 (const_int 2)))]
+	(vec_select:V2SF
+	  (vec_concat:V4SF
+	    (match_operand:V2SF 1 "register_operand" "f")
+	    (match_operand:V2SF 2 "register_operand" "f"))
+	  (parallel [(match_operand:SI 3 "const_0_or_1_operand" "")
+		     (match_operand:SI 4 "const_2_or_3_operand" "")])))]
   "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
-  "pul.ps\t%0,%1,%2"
-  [(set_attr "type" "fmove")
-   (set_attr "mode" "SF")])
+{
+  /* Let <op>L be the lower part of operand <op> and <op>U be the upper part.
+     The P[UL][UL].PS instruction always specifies the upper part of the
+     result first, so the instruction is:
 
-; puu.ps - Pair upper upper
-(define_insn "mips_puu_ps"
-  [(set (match_operand:V2SF 0 "register_operand" "=f")
-	(vec_merge:V2SF
-	 (match_operand:V2SF 1 "register_operand" "f")
-	 (vec_select:V2SF (match_operand:V2SF 2 "register_operand" "f")
-			  (parallel [(const_int 1)
-				     (const_int 0)]))
-	 (const_int 2)))]
-  "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
-  "puu.ps\t%0,%1,%2"
-  [(set_attr "type" "fmove")
-   (set_attr "mode" "SF")])
+	P<aUL><bUL>.PS %0,<aop>,<bop>
 
-; pll.ps - Pair Lower Lower
-(define_insn "mips_pll_ps"
-  [(set (match_operand:V2SF 0 "register_operand" "=f")
-	(vec_merge:V2SF
-	 (vec_select:V2SF (match_operand:V2SF 1 "register_operand" "f")
-			  (parallel [(const_int 1)
-				     (const_int 0)]))
-	 (match_operand:V2SF 2 "register_operand" "f")
-	 (const_int 2)))]
-  "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
-  "pll.ps\t%0,%1,%2"
-  [(set_attr "type" "fmove")
-   (set_attr "mode" "SF")])
+     where 0U == <aop><aUL> and 0L == <bop><bUL>.
 
-; plu.ps - Pair Lower Upper
-(define_insn "mips_plu_ps"
-  [(set (match_operand:V2SF 0 "register_operand" "=f")
-	(vec_merge:V2SF
-	 (vec_select:V2SF (match_operand:V2SF 1 "register_operand" "f")
-			  (parallel [(const_int 1)
-				     (const_int 0)]))
-	 (vec_select:V2SF (match_operand:V2SF 2 "register_operand" "f")
-			  (parallel [(const_int 1)
-				     (const_int 0)]))
-	 (const_int 2)))]
-  "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
-  "plu.ps\t%0,%1,%2"
+     GCC's vector indices are specified in memory order, which means
+     that vector element 0 is the lower part (L) on little-endian targets
+     and the upper part (U) on big-endian targets.  vec_concat likewise
+     concatenates in memory order, which means that operand 3 (being
+     0 or 1) selects part of operand 1 and operand 4 (being 2 or 3)
+     selects part of operand 2.
+
+     Let:
+
+	I3 = INTVAL (operands[3])
+	I4 = INTVAL (operands[4]) - 2
+
+     Taking the two endiannesses in turn:
+
+     Little-endian:
+
+        The semantics of the RTL pattern are:
+
+	{ 0L, 0U } = { X[I3], X[I4 + 2] }, where X = { 1L, 1U, 2L, 2U }
+
+	so: 0L = { 1L, 1U }[I3] (= <bop><bUL>)
+	    0U = { 2L, 2U }[I4] (= <aop><aUL>)
+
+	    <aop> = 2, <aUL> = I4 ? U : L
+	    <bop> = 1, <bUL> = I3 ? U : L
+
+	    [LL] !I4 && !I3   [UL] I4 && !I3
+	    [LU] !I4 && I3    [UU] I4 && I3
+
+     Big-endian:
+
+        The semantics of the RTL pattern are:
+
+	{ 0U, 0L } = { X[I3], X[I4 + 2] }, where X = { 1U, 1L, 2U, 2L }
+
+	so: 0U = { 1U, 1L }[I3] (= <aop><aUL>)
+	    0L = { 2U, 2L }[I4] (= <bop><bUL>)
+
+	    <aop> = 1, <aUL> = I3 ? L : U
+	    <bop> = 2, <bUL> = I4 ? L : U
+
+	    [UU] !I3 && !I4   [UL] !I3 && I4
+	    [LU] I3 && !I4    [LL] I3 && I4.  */
+
+  static const char * const mnemonics[2][4] = {
+    /* LE */ { "pll.ps\t%0,%2,%1", "pul.ps\t%0,%2,%1",
+	       "plu.ps\t%0,%2,%1", "puu.ps\t%0,%2,%1" },
+    /* BE */ { "puu.ps\t%0,%1,%2", "pul.ps\t%0,%1,%2",
+	       "plu.ps\t%0,%1,%2", "pll.ps\t%0,%1,%2" },
+  };
+
+  unsigned mask = INTVAL (operands[3]) * 2 + (INTVAL (operands[4]) - 2);
+  return mnemonics[BYTES_BIG_ENDIAN][mask];
+}
   [(set_attr "type" "fmove")
    (set_attr "mode" "SF")])
 
+(define_expand "vec_perm_constv2sf"
+  [(match_operand:V2SF 0 "register_operand" "")
+   (match_operand:V2SF 1 "register_operand" "")
+   (match_operand:V2SF 2 "register_operand" "")
+   (match_operand:V2SI 3 "" "")]
+  "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
+{
+  if (mips_expand_vec_perm_const (operands))
+    DONE;
+  else
+    FAIL;
+})
+
+;; Expanders for builtins.  The instruction:
+;;
+;;     P[UL][UL].PS <result>, <a>, <b>
+;;
+;; says that the upper part of <result> is taken from half of <a> and
+;; the lower part of <result> is taken from half of <b>.  This means
+;; that the P[UL][UL].PS operand order matches memory order on big-endian
+;; targets; <a> is element 0 of the V2SF result while <b> is element 1.
+;; However, the P[UL][UL].PS operand order is the reverse of memory order
+;; on little-endian targets; <a> is element 1 of the V2SF result while
+;; <b> is element 0.  The arguments to vec_perm_const_ps are always in
+;; memory order.
+;;
+;; Similarly, "U" corresponds to element 0 on big-endian targets but
+;; to element 1 on little-endian targets.
+
+(define_expand "mips_puu_ps"
+  [(match_operand:V2SF 0 "register_operand" "")
+   (match_operand:V2SF 1 "register_operand" "")
+   (match_operand:V2SF 2 "register_operand" "")]
+  "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
+{
+  if (BYTES_BIG_ENDIAN)
+    emit_insn (gen_vec_perm_const_ps (operands[0], operands[1], operands[2],
+				      const0_rtx, const2_rtx));
+  else
+    emit_insn (gen_vec_perm_const_ps (operands[0], operands[2], operands[1],
+				      const1_rtx, GEN_INT (3)));
+  DONE;
+})
+
+(define_expand "mips_pul_ps"
+  [(match_operand:V2SF 0 "register_operand" "")
+   (match_operand:V2SF 1 "register_operand" "")
+   (match_operand:V2SF 2 "register_operand" "")]
+  "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
+{
+  if (BYTES_BIG_ENDIAN)
+    emit_insn (gen_vec_perm_const_ps (operands[0], operands[1], operands[2],
+				      const0_rtx, GEN_INT (3)));
+  else
+    emit_insn (gen_vec_perm_const_ps (operands[0], operands[2], operands[1],
+				      const0_rtx, GEN_INT (3)));
+  DONE;
+})
+
+(define_expand "mips_plu_ps"
+  [(match_operand:V2SF 0 "register_operand" "")
+   (match_operand:V2SF 1 "register_operand" "")
+   (match_operand:V2SF 2 "register_operand" "")]
+  "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
+{
+  if (BYTES_BIG_ENDIAN)
+    emit_insn (gen_vec_perm_const_ps (operands[0], operands[1], operands[2],
+				      const1_rtx, const2_rtx));
+  else
+    emit_insn (gen_vec_perm_const_ps (operands[0], operands[2], operands[1],
+				      const1_rtx, const2_rtx));
+  DONE;
+})
+
+(define_expand "mips_pll_ps"
+  [(match_operand:V2SF 0 "register_operand" "")
+   (match_operand:V2SF 1 "register_operand" "")
+   (match_operand:V2SF 2 "register_operand" "")]
+  "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
+{
+  if (BYTES_BIG_ENDIAN)
+    emit_insn (gen_vec_perm_const_ps (operands[0], operands[1], operands[2],
+				      const1_rtx, GEN_INT (3)));
+  else
+    emit_insn (gen_vec_perm_const_ps (operands[0], operands[2], operands[1],
+				      const0_rtx, const2_rtx));
+  DONE;
+})
+
 ; vec_init
 (define_expand "vec_initv2sf"
   [(match_operand:V2SF 0 "register_operand")
@@ -195,22 +303,21 @@ 
 ;; no other way to get a vector mode bitfield store currently.
 
 (define_expand "vec_setv2sf"
-  [(match_operand:V2SF 0 "register_operand")
-   (match_operand:SF 1 "register_operand")
-   (match_operand 2 "const_0_or_1_operand")]
+  [(set (match_operand:V2SF 0 "register_operand" "")
+	(vec_select:V2SF
+	  (vec_concat:V4SF
+	    (match_operand:SF 1 "register_operand" "")
+	    (match_dup 0))
+	  (parallel [(match_operand 2 "const_0_or_1_operand" "")
+		     (match_dup 3)])))]
   "TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT"
 {
-  rtx temp;
-
   /* We don't have an insert instruction, so we duplicate the float, and
      then use a PUL instruction.  */
-  temp = gen_reg_rtx (V2SFmode);
+  rtx temp = gen_reg_rtx (V2SFmode);
   emit_insn (gen_mips_cvt_ps_s (temp, operands[1], operands[1]));
-  if (INTVAL (operands[2]) == !BYTES_BIG_ENDIAN)
-    emit_insn (gen_mips_pul_ps (operands[0], temp, operands[0]));
-  else
-    emit_insn (gen_mips_pul_ps (operands[0], operands[0], temp));
-  DONE;
+  operands[1] = temp;
+  operands[3] = GEN_INT (1 - INTVAL (operands[2]) + 2);
 })
 
 ; cvt.ps.s - Floating Point Convert Pair to Paired Single
diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c
index 84d4f8b..a1f06d4 100644
--- a/gcc/config/mips/mips.c
+++ b/gcc/config/mips/mips.c
@@ -12792,12 +12792,6 @@  AVAIL_NON_MIPS16 (cache, TARGET_CACHE_BUILTIN)
 #define CODE_FOR_loongson_psubsb CODE_FOR_sssubv8qi3
 #define CODE_FOR_loongson_psubush CODE_FOR_ussubv4hi3
 #define CODE_FOR_loongson_psubusb CODE_FOR_ussubv8qi3
-#define CODE_FOR_loongson_punpckhbh CODE_FOR_vec_interleave_highv8qi
-#define CODE_FOR_loongson_punpckhhw CODE_FOR_vec_interleave_highv4hi
-#define CODE_FOR_loongson_punpckhwd CODE_FOR_vec_interleave_highv2si
-#define CODE_FOR_loongson_punpcklbh CODE_FOR_vec_interleave_lowv8qi
-#define CODE_FOR_loongson_punpcklhw CODE_FOR_vec_interleave_lowv4hi
-#define CODE_FOR_loongson_punpcklwd CODE_FOR_vec_interleave_lowv2si
 
 static const struct mips_builtin_description mips_builtins[] = {
   DIRECT_BUILTIN (pll_ps, MIPS_V2SF_FTYPE_V2SF_V2SF, paired_single),
@@ -13039,8 +13033,8 @@  static const struct mips_builtin_description mips_builtins[] = {
   LOONGSON_BUILTIN (pasubub, MIPS_UV8QI_FTYPE_UV8QI_UV8QI),
   LOONGSON_BUILTIN (biadd, MIPS_UV4HI_FTYPE_UV8QI),
   LOONGSON_BUILTIN (psadbh, MIPS_UV4HI_FTYPE_UV8QI_UV8QI),
-  LOONGSON_BUILTIN_SUFFIX (pshufh, u, MIPS_UV4HI_FTYPE_UV4HI_UV4HI_UQI),
-  LOONGSON_BUILTIN_SUFFIX (pshufh, s, MIPS_V4HI_FTYPE_V4HI_V4HI_UQI),
+  LOONGSON_BUILTIN_SUFFIX (pshufh, u, MIPS_UV4HI_FTYPE_UV4HI_UQI),
+  LOONGSON_BUILTIN_SUFFIX (pshufh, s, MIPS_V4HI_FTYPE_V4HI_UQI),
   LOONGSON_BUILTIN_SUFFIX (psllh, u, MIPS_UV4HI_FTYPE_UV4HI_UQI),
   LOONGSON_BUILTIN_SUFFIX (psllh, s, MIPS_V4HI_FTYPE_V4HI_UQI),
   LOONGSON_BUILTIN_SUFFIX (psllw, u, MIPS_UV2SI_FTYPE_UV2SI_UQI),
@@ -16344,6 +16338,221 @@  mips_shift_truncation_mask (enum machine_mode mode)
 }
 
 
+/* Generate or test for an insn that supports a constant permutation.  */
+
+#define MAX_VECT_LEN 8
+
+struct expand_vec_perm_d
+{
+  rtx target, op0, op1;
+  unsigned char perm[MAX_VECT_LEN];
+  enum machine_mode vmode;
+  unsigned char nelt;
+  bool one_vector_p;
+  bool testing_p;
+};
+
+/* Construct (set target (vec_select op0 (parallel perm))) and
+   return true if that's a valid instruction in the active ISA.  */
+
+static bool
+expand_vselect (rtx target, rtx op0, const unsigned char *perm, unsigned nelt)
+{
+  rtx rperm[MAX_VECT_LEN], x;
+  unsigned i;
+
+  for (i = 0; i < nelt; ++i)
+    rperm[i] = GEN_INT (perm[i]);
+
+  x = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelt, rperm));
+  x = gen_rtx_VEC_SELECT (GET_MODE (target), op0, x);
+  x = gen_rtx_SET (VOIDmode, target, x);
+
+  x = emit_insn (x);
+  if (recog_memoized (x) < 0)
+    {
+      remove_insn (x);
+      return false;
+    }
+  return true;
+}
+
+/* Similar, but generate a vec_concat from op0 and op1 as well.  */
+
+static bool
+expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
+			const unsigned char *perm, unsigned nelt)
+{
+  enum machine_mode v2mode;
+  rtx x;
+
+  v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
+  x = gen_rtx_VEC_CONCAT (v2mode, op0, op1);
+  return expand_vselect (target, x, perm, nelt);
+}
+
+/* Recognize patterns for the Loongson PSHUFH instruction.  */
+
+static bool
+mips_expand_vpc_loongson_pshufh (struct expand_vec_perm_d *d)
+{
+  unsigned i, mask;
+
+  if (!(TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS))
+    return false;
+  if (d->vmode != V4HImode)
+    return false;
+  if (!d->one_vector_p)
+    return false;
+  if (d->testing_p)
+    return true;
+
+  /* Convert the selector into the packed 8-bit form for pshufh.  */
+  /* Recall that loongson is little-endian only.  No big-endian
+     adjustment required.  */
+  for (i = mask = 0; i < 4; i++)
+    mask |= (d->perm[i] & 3) << (i * 2);
+
+  emit_insn (gen_loongson_pshufh (d->target, d->op0,
+				  force_reg (SImode, GEN_INT (mask))));
+  return true;
+}
+
+static bool
+mips_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
+{
+  unsigned int i, nelt = d->nelt;
+  unsigned char perm2[MAX_VECT_LEN];
+
+  if (d->one_vector_p)
+    {
+      /* Try interleave with alternating operands.  */
+      memcpy (perm2, d->perm, sizeof(perm2));
+      for (i = 1; i < nelt; i += 2)
+	perm2[i] += nelt;
+      if (expand_vselect_vconcat (d->target, d->op0, d->op1, perm2, nelt))
+	return true;
+    }
+  else
+    {
+      if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt))
+	return true;
+
+      /* Try again with swapped operands.  */
+      for (i = 0; i < nelt; ++i)
+	perm2[i] = (d->perm[i] + nelt) & (2 * nelt - 1);
+      if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt))
+	return true;
+    }
+
+  if (mips_expand_vpc_loongson_pshufh (d))
+    return true;
+  return false;
+}
+
+/* Expand a vec_perm_const pattern.  */
+
+bool
+mips_expand_vec_perm_const (rtx operands[4])
+{
+  struct expand_vec_perm_d d;
+  int i, nelt, which;
+  rtx sel;
+
+  d.target = operands[0];
+  d.op0 = operands[1];
+  d.op1 = operands[2];
+  sel = operands[3];
+
+  d.vmode = GET_MODE (d.target);
+  gcc_assert (VECTOR_MODE_P (d.vmode));
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = false;
+
+  for (i = which = 0; i < nelt; ++i)
+    {
+      rtx e = XVECEXP (sel, 0, i);
+      int ei = INTVAL (e) & (2 * nelt - 1);
+      which |= (ei < nelt ? 1 : 2);
+      d.perm[i] = ei;
+    }
+
+  switch (which)
+    {
+    default:
+      gcc_unreachable();
+
+    case 3:
+      d.one_vector_p = false;
+      if (!rtx_equal_p (d.op0, d.op1))
+	break;
+
+      /* The backend (vec_select (vec_concat)) patterns are not duplicated
+	 for single-operand.  Try once with the original un-folded selector. */
+      if (mips_expand_vec_perm_const_1 (&d))
+	return true;
+
+      /* Try again after folding the selector to a single operand.  */
+      /* FALLTHRU */
+    case 2:
+      for (i = 0; i < nelt; ++i)
+        d.perm[i] &= nelt - 1;
+      d.op0 = d.op1;
+      d.one_vector_p = true;
+      break;
+
+    case 1:
+      d.op1 = d.op0;
+      d.one_vector_p = true;
+      break;
+    }
+
+  return mips_expand_vec_perm_const_1 (&d);
+}
+
+/* Implement TARGET_VECTORIZE_VEC_PERM_CONST_OK.  */
+
+static bool
+mips_vectorize_vec_perm_const_ok (enum machine_mode vmode,
+				  const unsigned char *sel)
+{
+  struct expand_vec_perm_d d;
+  unsigned int i, nelt, which;
+  bool ret;
+
+  d.vmode = vmode;
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = true;
+  memcpy (d.perm, sel, nelt);
+
+  /* Categorize the set of elements in the selector.  */
+  for (i = which = 0; i < nelt; ++i)
+    {
+      unsigned char e = d.perm[i];
+      gcc_assert (e < 2 * nelt);
+      which |= (e < nelt ? 1 : 2);
+    }
+
+  /* For all elements from second vector, fold the elements to first.  */
+  if (which == 2)
+    for (i = 0; i < nelt; ++i)
+      d.perm[i] -= nelt;
+
+  /* Check whether the mask can be applied to the vector type.  */
+  d.one_vector_p = (which != 3);
+
+  d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
+  d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
+  if (!d.one_vector_p)
+    d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
+
+  start_sequence ();
+  ret = mips_expand_vec_perm_const_1 (&d);
+  end_sequence ();
+
+  return ret;
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_ASM_ALIGNED_HI_OP
 #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
@@ -16562,6 +16771,9 @@  mips_shift_truncation_mask (enum machine_mode mode)
 #undef TARGET_SHIFT_TRUNCATION_MASK
 #define TARGET_SHIFT_TRUNCATION_MASK mips_shift_truncation_mask
 
+#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
+#define TARGET_VECTORIZE_VEC_PERM_CONST_OK mips_vectorize_vec_perm_const_ok
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-mips.h"
diff --git a/gcc/config/mips/predicates.md b/gcc/config/mips/predicates.md
index 5e9398e..b611373 100644
--- a/gcc/config/mips/predicates.md
+++ b/gcc/config/mips/predicates.md
@@ -73,8 +73,15 @@ 
 ;; This is used for indexing into vectors, and hence only accepts const_int.
 (define_predicate "const_0_or_1_operand"
   (and (match_code "const_int")
-       (ior (match_test "op == CONST0_RTX (GET_MODE (op))")
-	    (match_test "op == CONST1_RTX (GET_MODE (op))"))))
+       (match_test "IN_RANGE (INTVAL (op), 0, 1)")))
+
+(define_predicate "const_2_or_3_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 2, 3)")))
+
+(define_predicate "const_0_to_3_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 0, 3)")))
 
 (define_predicate "qi_mask_operand"
   (and (match_code "const_int")