From patchwork Fri Jan 7 23:01:13 2011
X-Patchwork-Submitter: Richard Henderson
X-Patchwork-Id: 77930
Message-ID: <4D279B39.2090404@redhat.com>
Date: Fri, 07 Jan 2011 15:01:13 -0800
From: Richard Henderson
To: sje@cup.hp.com
Cc: gcc-patches@gcc.gnu.org
Subject: Re: [ia64, rfa] vector pattern improvements
References: <201101061725.p06HPCX21825@lucas.cup.hp.com> <4D260FB1.80800@redhat.com> <1294345320.21802.38.camel@hpsje.cup.hp.com>
In-Reply-To: <1294345320.21802.38.camel@hpsje.cup.hp.com>

On 01/06/2011 12:22 PM, Steve Ellcey wrote:
> If swapping arguments doesn't work for all patterns then I think I would
> rather use the predicate method in all cases instead of having some use
> the select*parallel predicate and others swapping arguments.

It turns out that swapping arguments + opcodes does work for all
patterns.  There's a fair amount of symmetry in the ISA, thankfully.

I'd like to take a moment to thank the ia64 ISA designers for their
lack of consistency in whether r2 or r3 refers to the right or the left
operand in the operation diagrams of the ISA manual.  It makes verifying
that the arguments are properly ordered for big and little endian so
much more fun.

Anyway, this passed vect.exp on linux earlier; I'm running a full test
now.  If you could set up an hpux test run for this evening, that would
be great.

r~
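P.S. A note on the convention every hunk below leans on: RTL numbers
vector elements in memory order, so on big-endian the lowest-numbered
element lands in the most-significant bits of the register.  A throwaway
C sketch of that index mapping (the helper is illustrative only; it
exists nowhere in the tree):

#include <assert.h>

/* Bit position of the LSB of memory-order element ELEM within a
   64-bit general register holding a V8QI value.  Little-endian puts
   element 0 in bits 0..7; big-endian puts it in bits 56..63.  This is
   why the new mux1_brcst_element predicate below accepts 7 for
   big-endian where little-endian uses 0, and why the insn templates
   swap .l/.r opcode suffixes and operand order.  */
static int
v8qi_elem_lsb (int elem, int big_endian_p)
{
  assert (0 <= elem && elem < 8);
  return 8 * (big_endian_p ? 7 - elem : elem);
}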
Index: gcc/config/ia64/predicates.md
===================================================================
--- gcc/config/ia64/predicates.md	(revision 168549)
+++ gcc/config/ia64/predicates.md	(working copy)
@@ -624,3 +624,7 @@
   return REG_P (op) && REG_POINTER (op);
 })
 
+;; True if this is the right-most vector element; for mux1 @brcst.
+(define_predicate "mux1_brcst_element"
+  (and (match_code "const_int")
+       (match_test "INTVAL (op) == (TARGET_BIG_ENDIAN ? 7 : 0)")))
Index: gcc/config/ia64/ia64.c
===================================================================
--- gcc/config/ia64/ia64.c	(revision 168549)
+++ gcc/config/ia64/ia64.c	(working copy)
@@ -1972,12 +1972,13 @@
   return true;
 }
 
-/* Emit an integral vector unpack operation.  */
+/* The vectors LO and HI each contain N halves of a double-wide vector.
+   Reassemble either the first N/2 or the second N/2 elements.  */
 
 void
-ia64_expand_unpack (rtx operands[3], bool unsignedp, bool highp)
+ia64_unpack_assemble (rtx out, rtx lo, rtx hi, bool highp)
 {
-  enum machine_mode mode = GET_MODE (operands[1]);
+  enum machine_mode mode = GET_MODE (lo);
   rtx (*gen) (rtx, rtx, rtx);
   rtx x;
 
@@ -1993,96 +1994,66 @@
       gcc_unreachable ();
     }
 
-  /* Fill in x with the sign extension of each element in op1.  */
-  if (unsignedp)
-    x = CONST0_RTX (mode);
+  x = gen_lowpart (mode, out);
+  if (TARGET_BIG_ENDIAN)
+    x = gen (x, hi, lo);
   else
-    {
-      bool neg;
-
-      x = gen_reg_rtx (mode);
-
-      neg = ia64_expand_vecint_compare (LT, mode, x, operands[1],
-                                        CONST0_RTX (mode));
-      gcc_assert (!neg);
-    }
-
-  emit_insn (gen (gen_lowpart (mode, operands[0]), operands[1], x));
+    x = gen (x, lo, hi);
+  emit_insn (x);
 }
 
-/* Emit an integral vector widening sum operations.  */
+/* Return a vector of the sign-extension of VEC.  */
 
-void
-ia64_expand_widen_sum (rtx operands[3], bool unsignedp)
+static rtx
+ia64_unpack_sign (rtx vec, bool unsignedp)
 {
-  rtx l, h, x, s;
-  enum machine_mode wmode, mode;
-  rtx (*unpack_l) (rtx, rtx, rtx);
-  rtx (*unpack_h) (rtx, rtx, rtx);
-  rtx (*plus) (rtx, rtx, rtx);
+  enum machine_mode mode = GET_MODE (vec);
+  rtx zero = CONST0_RTX (mode);
 
-  wmode = GET_MODE (operands[0]);
-  mode = GET_MODE (operands[1]);
-
-  switch (mode)
-    {
-    case V8QImode:
-      unpack_l = gen_vec_interleave_lowv8qi;
-      unpack_h = gen_vec_interleave_highv8qi;
-      plus = gen_addv4hi3;
-      break;
-    case V4HImode:
-      unpack_l = gen_vec_interleave_lowv4hi;
-      unpack_h = gen_vec_interleave_highv4hi;
-      plus = gen_addv2si3;
-      break;
-    default:
-      gcc_unreachable ();
-    }
-
-  /* Fill in x with the sign extension of each element in op1.  */
   if (unsignedp)
-    x = CONST0_RTX (mode);
+    return zero;
   else
     {
+      rtx sign = gen_reg_rtx (mode);
       bool neg;
 
-      x = gen_reg_rtx (mode);
-
-      neg = ia64_expand_vecint_compare (LT, mode, x, operands[1],
-                                        CONST0_RTX (mode));
+      neg = ia64_expand_vecint_compare (LT, mode, sign, vec, zero);
       gcc_assert (!neg);
+
+      return sign;
     }
+}
 
-  l = gen_reg_rtx (wmode);
-  h = gen_reg_rtx (wmode);
-  s = gen_reg_rtx (wmode);
+/* Emit an integral vector unpack operation.  */
 
-  emit_insn (unpack_l (gen_lowpart (mode, l), operands[1], x));
-  emit_insn (unpack_h (gen_lowpart (mode, h), operands[1], x));
-  emit_insn (plus (s, l, operands[2]));
-  emit_insn (plus (operands[0], h, s));
+void
+ia64_expand_unpack (rtx operands[3], bool unsignedp, bool highp)
+{
+  rtx sign = ia64_unpack_sign (operands[1], unsignedp);
+  ia64_unpack_assemble (operands[0], operands[1], sign, highp);
 }
 
+/* Emit an integral vector widening sum operations.  */
+
 void
-ia64_expand_widen_mul_v4hi (rtx operands[3], bool unsignedp, bool highp)
+ia64_expand_widen_sum (rtx operands[3], bool unsignedp)
 {
-  rtx l = gen_reg_rtx (V4HImode);
-  rtx h = gen_reg_rtx (V4HImode);
-  rtx (*mulhigh)(rtx, rtx, rtx, rtx);
-  rtx (*interl)(rtx, rtx, rtx);
+  enum machine_mode wmode;
+  rtx l, h, t, sign;
 
-  emit_insn (gen_mulv4hi3 (l, operands[1], operands[2]));
+  sign = ia64_unpack_sign (operands[1], unsignedp);
 
-  /* For signed, pmpy2.r would appear to more closely match this operation.
-     However, the vectorizer is more likely to use the LO and HI patterns
-     in pairs.  At which point, with this formulation, the first two insns
-     of each can be CSEd.  */
-  mulhigh = unsignedp ? gen_pmpyshr2_u : gen_pmpyshr2;
-  emit_insn (mulhigh (h, operands[1], operands[2], GEN_INT (16)));
+  wmode = GET_MODE (operands[0]);
+  l = gen_reg_rtx (wmode);
+  h = gen_reg_rtx (wmode);
 
-  interl = highp ? gen_vec_interleave_highv4hi : gen_vec_interleave_lowv4hi;
-  emit_insn (interl (gen_lowpart (V4HImode, operands[0]), l, h));
+  ia64_unpack_assemble (l, operands[1], sign, false);
+  ia64_unpack_assemble (h, operands[1], sign, true);
+
+  t = expand_binop (wmode, add_optab, l, operands[2], NULL, 0, OPTAB_DIRECT);
+  t = expand_binop (wmode, add_optab, h, t, operands[0], 0, OPTAB_DIRECT);
+  if (t != operands[0])
+    emit_move_insn (operands[0], t);
 }
 
 /* Emit a signed or unsigned V8QI dot product operation.  */
@@ -2090,48 +2061,31 @@
 void
 ia64_expand_dot_prod_v8qi (rtx operands[4], bool unsignedp)
 {
-  rtx l1, l2, h1, h2, x1, x2, p1, p2, p3, p4, s1, s2, s3;
+  rtx op1, op2, sn1, sn2, l1, l2, h1, h2;
+  rtx p1, p2, p3, p4, s1, s2, s3;
 
-  /* Fill in x1 and x2 with the sign extension of each element.  */
-  if (unsignedp)
-    x1 = x2 = CONST0_RTX (V8QImode);
-  else
-    {
-      bool neg;
+  op1 = operands[1];
+  op2 = operands[2];
+  sn1 = ia64_unpack_sign (op1, unsignedp);
+  sn2 = ia64_unpack_sign (op2, unsignedp);
 
-      x1 = gen_reg_rtx (V8QImode);
-      x2 = gen_reg_rtx (V8QImode);
-
-      neg = ia64_expand_vecint_compare (LT, V8QImode, x1, operands[1],
-                                        CONST0_RTX (V8QImode));
-      gcc_assert (!neg);
-      neg = ia64_expand_vecint_compare (LT, V8QImode, x2, operands[2],
                                        CONST0_RTX (V8QImode));
-      gcc_assert (!neg);
-    }
-
   l1 = gen_reg_rtx (V4HImode);
   l2 = gen_reg_rtx (V4HImode);
   h1 = gen_reg_rtx (V4HImode);
   h2 = gen_reg_rtx (V4HImode);
+  ia64_unpack_assemble (l1, op1, sn1, false);
+  ia64_unpack_assemble (l2, op2, sn2, false);
+  ia64_unpack_assemble (h1, op1, sn1, true);
+  ia64_unpack_assemble (h2, op2, sn2, true);
 
-  emit_insn (gen_vec_interleave_lowv8qi
-             (gen_lowpart (V8QImode, l1), operands[1], x1));
-  emit_insn (gen_vec_interleave_lowv8qi
-             (gen_lowpart (V8QImode, l2), operands[2], x2));
-  emit_insn (gen_vec_interleave_highv8qi
-             (gen_lowpart (V8QImode, h1), operands[1], x1));
-  emit_insn (gen_vec_interleave_highv8qi
-             (gen_lowpart (V8QImode, h2), operands[2], x2));
-
   p1 = gen_reg_rtx (V2SImode);
   p2 = gen_reg_rtx (V2SImode);
   p3 = gen_reg_rtx (V2SImode);
   p4 = gen_reg_rtx (V2SImode);
-  emit_insn (gen_pmpy2_r (p1, l1, l2));
-  emit_insn (gen_pmpy2_l (p2, l1, l2));
-  emit_insn (gen_pmpy2_r (p3, h1, h2));
-  emit_insn (gen_pmpy2_l (p4, h1, h2));
+  emit_insn (gen_pmpy2_even (p1, l1, l2));
+  emit_insn (gen_pmpy2_even (p2, h1, h2));
+  emit_insn (gen_pmpy2_odd (p3, l1, l2));
+  emit_insn (gen_pmpy2_odd (p4, h1, h2));
 
   s1 = gen_reg_rtx (V2SImode);
   s2 = gen_reg_rtx (V2SImode);
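Aside, not part of the patch: the refactoring above hinges on one
identity: interleaving a vector with the mask produced by the
LT-against-zero compare yields the sign extension of each element.  A
scalar model in plain C, with names made up for illustration:

#include <stdint.h>
#include <assert.h>

/* Widen one V8QI element to HImode the way ia64_unpack_assemble does:
   pair the value byte with a sign byte.  ia64_unpack_sign supplies
   zero for unsigned widening, or (x < 0 ? 0xff : 0x00) for signed,
   via the vector LT compare.  */
static int16_t
widen_elem (int8_t x, int unsignedp)
{
  uint8_t sign = unsignedp ? 0 : (x < 0 ? 0xff : 0x00);
  return (int16_t) (((uint16_t) sign << 8) | (uint8_t) x);
}

int
main (void)
{
  assert (widen_elem (-5, 0) == -5);    /* signed: sign byte 0xff */
  assert (widen_elem (-5, 1) == 251);   /* unsigned: zero extend  */
  return 0;
}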
Index: gcc/config/ia64/vect.md
===================================================================
--- gcc/config/ia64/vect.md	(revision 168549)
+++ gcc/config/ia64/vect.md	(working copy)
@@ -172,35 +172,11 @@
 	  (match_operand:V8QI 2 "gr_register_operand" "r")))]
   ""
 {
-  rtx r1, l1, r2, l2, rm, lm;
-
-  r1 = gen_reg_rtx (V4HImode);
-  l1 = gen_reg_rtx (V4HImode);
-  r2 = gen_reg_rtx (V4HImode);
-  l2 = gen_reg_rtx (V4HImode);
-
-  /* Zero-extend the QImode elements into two words of HImode elements
-     by interleaving them with zero bytes.  */
-  emit_insn (gen_mix1_r (gen_lowpart (V8QImode, r1),
-                         operands[1], CONST0_RTX (V8QImode)));
-  emit_insn (gen_mix1_r (gen_lowpart (V8QImode, r2),
-                         operands[2], CONST0_RTX (V8QImode)));
-  emit_insn (gen_mix1_l (gen_lowpart (V8QImode, l1),
-                         operands[1], CONST0_RTX (V8QImode)));
-  emit_insn (gen_mix1_l (gen_lowpart (V8QImode, l2),
-                         operands[2], CONST0_RTX (V8QImode)));
-
-  /* Multiply.  */
-  rm = gen_reg_rtx (V4HImode);
-  lm = gen_reg_rtx (V4HImode);
-  emit_insn (gen_mulv4hi3 (rm, r1, r2));
-  emit_insn (gen_mulv4hi3 (lm, l1, l2));
-
-  /* Zap the high order bytes of the HImode elements by overwriting those
-     in one part with the low order bytes of the other.  */
-  emit_insn (gen_mix1_r (operands[0],
-                         gen_lowpart (V8QImode, rm),
-                         gen_lowpart (V8QImode, lm)));
+  rtx l = gen_reg_rtx (V4HImode);
+  rtx h = gen_reg_rtx (V4HImode);
+  emit_insn (gen_vec_widen_umult_lo_v8qi (l, operands[1], operands[2]));
+  emit_insn (gen_vec_widen_umult_hi_v8qi (h, operands[1], operands[2]));
+  emit_insn (gen_vec_pack_trunc_v4hi (operands[0], l, h));
   DONE;
 })
 
@@ -296,7 +272,7 @@
   "pmpyshr2.u %0 = %1, %2, %3"
   [(set_attr "itanium_class" "mmmul")])
 
-(define_insn "pmpy2_r"
+(define_insn "pmpy2_even"
   [(set (match_operand:V2SI 0 "gr_register_operand" "=r")
 	(mult:V2SI
 	  (vec_select:V2SI
@@ -308,10 +284,16 @@
 	      (match_operand:V4HI 2 "gr_register_operand" "r"))
 	    (parallel [(const_int 0) (const_int 2)]))))]
   ""
-  "pmpy2.r %0 = %1, %2"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,pmpy2.l %0 = %1, %2";
+  else
+    return "%,pmpy2.r %0 = %1, %2";
+}
   [(set_attr "itanium_class" "mmshf")])
 
-(define_insn "pmpy2_l"
+(define_insn "pmpy2_odd"
   [(set (match_operand:V2SI 0 "gr_register_operand" "=r")
 	(mult:V2SI
 	  (vec_select:V2SI
@@ -323,7 +305,13 @@
 	      (match_operand:V4HI 2 "gr_register_operand" "r"))
 	    (parallel [(const_int 1) (const_int 3)]))))]
   ""
-  "pmpy2.l %0 = %1, %2"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,pmpy2.r %0 = %1, %2";
+  else
+    return "%,pmpy2.l %0 = %1, %2";
+}
   [(set_attr "itanium_class" "mmshf")])
 
 (define_expand "vec_widen_smult_lo_v4hi"
@@ -332,7 +320,11 @@
    (match_operand:V4HI 2 "gr_register_operand" "")]
   ""
 {
-  ia64_expand_widen_mul_v4hi (operands, false, false);
+  rtx l = gen_reg_rtx (V4HImode);
+  rtx h = gen_reg_rtx (V4HImode);
+  emit_insn (gen_mulv4hi3 (l, operands[1], operands[2]));
+  emit_insn (gen_pmpyshr2 (h, operands[1], operands[2], GEN_INT (16)));
+  ia64_unpack_assemble (operands[0], l, h, false);
   DONE;
 })
 
@@ -342,7 +334,11 @@
    (match_operand:V4HI 2 "gr_register_operand" "")]
   ""
 {
-  ia64_expand_widen_mul_v4hi (operands, false, true);
+  rtx l = gen_reg_rtx (V4HImode);
+  rtx h = gen_reg_rtx (V4HImode);
+  emit_insn (gen_mulv4hi3 (l, operands[1], operands[2]));
+  emit_insn (gen_pmpyshr2 (h, operands[1], operands[2], GEN_INT (16)));
+  ia64_unpack_assemble (operands[0], l, h, true);
   DONE;
 })
 
@@ -352,7 +348,11 @@
    (match_operand:V4HI 2 "gr_register_operand" "")]
   ""
 {
-  ia64_expand_widen_mul_v4hi (operands, true, false);
+  rtx l = gen_reg_rtx (V4HImode);
+  rtx h = gen_reg_rtx (V4HImode);
+  emit_insn (gen_mulv4hi3 (l, operands[1], operands[2]));
+  emit_insn (gen_pmpyshr2_u (h, operands[1], operands[2], GEN_INT (16)));
+  ia64_unpack_assemble (operands[0], l, h, false);
   DONE;
 })
 
@@ -362,7 +362,11 @@
    (match_operand:V4HI 2 "gr_register_operand" "")]
   ""
 {
-  ia64_expand_widen_mul_v4hi (operands, true, true);
+  rtx l = gen_reg_rtx (V4HImode);
+  rtx h = gen_reg_rtx (V4HImode);
+  emit_insn (gen_mulv4hi3 (l, operands[1], operands[2]));
+  emit_insn (gen_pmpyshr2_u (h, operands[1], operands[2], GEN_INT (16)));
+  ia64_unpack_assemble (operands[0], l, h, true);
   DONE;
 })
 
@@ -405,15 +409,28 @@
   emit_insn (gen_mulv4hi3 (t3, t0, op2h));
 
   /* T4 = CY.h, CY.l, AW.h, AW.l = CY, AW.  */
-  emit_insn (gen_mix2_r (gen_lowpart (V4HImode, t4), t1, t2));
+  x = gen_lowpart (V4HImode, t4);
+  if (TARGET_BIG_ENDIAN)
+    x = gen_mix2_odd (x, t2, t1);
+  else
+    x = gen_mix2_even (x, t1, t2);
+  emit_insn (x);
 
   /* T5 = CZ.l, 0, AX.l, 0 = CZ << 16, AX << 16.  */
-  emit_insn (gen_mix2_l (gen_lowpart (V4HImode, t5),
-                         CONST0_RTX (V4HImode), t3));
+  x = gen_lowpart (V4HImode, t5);
+  if (TARGET_BIG_ENDIAN)
+    x = gen_mix2_even (x, t3, CONST0_RTX (V4HImode));
+  else
+    x = gen_mix2_odd (x, CONST0_RTX (V4HImode), t3);
+  emit_insn (x);
 
   /* T6 = DY.l, 0, BW.l, 0 = DY << 16, BW << 16.  */
-  emit_insn (gen_mix2_r (gen_lowpart (V4HImode, t6),
-                         CONST0_RTX (V4HImode), t3));
+  x = gen_lowpart (V4HImode, t6);
+  if (TARGET_BIG_ENDIAN)
+    x = gen_mix2_odd (x, t3, CONST0_RTX (V4HImode));
+  else
+    x = gen_mix2_even (x, CONST0_RTX (V4HImode), t3);
+  emit_insn (x);
 
   emit_insn (gen_addv2si3 (t7, t4, t5));
   emit_insn (gen_addv2si3 (operands[0], t6, t7));
@@ -608,16 +625,36 @@
    (match_operand:V2SI 3 "gr_register_operand" "")]
   ""
 {
-  rtx l, r, t;
+  rtx e, o, t;
 
-  r = gen_reg_rtx (V2SImode);
+  e = gen_reg_rtx (V2SImode);
+  o = gen_reg_rtx (V2SImode);
+  t = gen_reg_rtx (V2SImode);
+
+  emit_insn (gen_pmpy2_even (e, operands[1], operands[2]));
+  emit_insn (gen_pmpy2_odd (o, operands[1], operands[2]));
+  emit_insn (gen_addv2si3 (t, e, operands[3]));
+  emit_insn (gen_addv2si3 (operands[0], t, o));
+  DONE;
+})
+
+(define_expand "udot_prodv4hi"
+  [(match_operand:V2SI 0 "gr_register_operand" "")
+   (match_operand:V4HI 1 "gr_register_operand" "")
+   (match_operand:V4HI 2 "gr_register_operand" "")
+   (match_operand:V2SI 3 "gr_register_operand" "")]
+  ""
+{
+  rtx l, h, t;
+
   l = gen_reg_rtx (V2SImode);
+  h = gen_reg_rtx (V2SImode);
   t = gen_reg_rtx (V2SImode);
 
-  emit_insn (gen_pmpy2_r (r, operands[1], operands[2]));
-  emit_insn (gen_pmpy2_l (l, operands[1], operands[2]));
-  emit_insn (gen_addv2si3 (t, r, operands[3]));
-  emit_insn (gen_addv2si3 (operands[0], t, l));
+  emit_insn (gen_vec_widen_umult_lo_v4hi (l, operands[1], operands[2]));
+  emit_insn (gen_vec_widen_umult_hi_v4hi (h, operands[1], operands[2]));
+  emit_insn (gen_addv2si3 (t, l, operands[3]));
+  emit_insn (gen_addv2si3 (operands[0], t, h));
   DONE;
 })
 
@@ -673,7 +710,13 @@
 	  (ss_truncate:V4QI
 	    (match_operand:V4HI 2 "gr_reg_or_0_operand" "rU"))))]
   ""
-  "pack2.sss %0 = %r1, %r2"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,pack2.sss %0 = %r2, %r1";
+  else
+    return "%,pack2.sss %0 = %r1, %r2";
+}
   [(set_attr "itanium_class" "mmshf")])
 
 (define_insn "vec_pack_usat_v4hi"
@@ -684,7 +727,13 @@
 	  (us_truncate:V4QI
 	    (match_operand:V4HI 2 "gr_reg_or_0_operand" "rU"))))]
   ""
-  "pack2.uss %0 = %r1, %r2"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,pack2.uss %0 = %r2, %r1";
+  else
+    return "%,pack2.uss %0 = %r1, %r2";
+}
   [(set_attr "itanium_class" "mmshf")])
 
 (define_insn "vec_pack_ssat_v2si"
@@ -695,7 +744,13 @@
 	  (ss_truncate:V2HI
 	    (match_operand:V2SI 2 "gr_reg_or_0_operand" "rU"))))]
   ""
-  "pack4.sss %0 = %r1, %r2"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,pack4.sss %0 = %r2, %r1";
+  else
+    return "%,pack4.sss %0 = %r1, %r2";
+}
   [(set_attr "itanium_class" "mmshf")])
 
 (define_insn "vec_interleave_lowv8qi"
@@ -709,7 +764,13 @@
 	     (const_int 2) (const_int 10)
 	     (const_int 3) (const_int 11)])))]
   ""
-  "unpack1.l %0 = %r2, %r1"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,unpack1.h %0 = %r1, %r2";
+  else
+    return "%,unpack1.l %0 = %r2, %r1";
+}
   [(set_attr "itanium_class" "mmshf")])
 
 (define_insn "vec_interleave_highv8qi"
@@ -723,57 +784,63 @@
 	     (const_int 6) (const_int 14)
 	     (const_int 7) (const_int 15)])))]
   ""
-  "unpack1.h %0 = %r2, %r1"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,unpack1.l %0 = %r1, %r2";
+  else
+    return "%,unpack1.h %0 = %r2, %r1";
+}
   [(set_attr "itanium_class" "mmshf")])
 
-(define_insn "mix1_r"
+(define_insn "mix1_even"
   [(set (match_operand:V8QI 0 "gr_register_operand" "=r")
 	(vec_select:V8QI
 	  (vec_concat:V16QI
 	    (match_operand:V8QI 1 "gr_reg_or_0_operand" "rU")
 	    (match_operand:V8QI 2 "gr_reg_or_0_operand" "rU"))
-	  (parallel [(const_int 0)
-		     (const_int 8)
-		     (const_int 2)
-		     (const_int 10)
-		     (const_int 4)
-		     (const_int 12)
-		     (const_int 6)
-		     (const_int 14)])))]
+	  (parallel [(const_int 0) (const_int 8)
+		     (const_int 2) (const_int 10)
		     (const_int 4) (const_int 12)
+		     (const_int 6) (const_int 14)])))]
   ""
-  "mix1.r %0 = %r2, %r1"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,mix1.l %0 = %r1, %r2";
+  else
+    return "%,mix1.r %0 = %r2, %r1";
+}
   [(set_attr "itanium_class" "mmshf")])
 
-(define_insn "mix1_l"
+(define_insn "mix1_odd"
   [(set (match_operand:V8QI 0 "gr_register_operand" "=r")
 	(vec_select:V8QI
 	  (vec_concat:V16QI
 	    (match_operand:V8QI 1 "gr_reg_or_0_operand" "rU")
 	    (match_operand:V8QI 2 "gr_reg_or_0_operand" "rU"))
-	  (parallel [(const_int 1)
-		     (const_int 9)
-		     (const_int 3)
-		     (const_int 11)
-		     (const_int 5)
-		     (const_int 13)
-		     (const_int 7)
-		     (const_int 15)])))]
+	  (parallel [(const_int 1) (const_int 9)
+		     (const_int 3) (const_int 11)
+		     (const_int 5) (const_int 13)
+		     (const_int 7) (const_int 15)])))]
   ""
-  "mix1.l %0 = %r2, %r1"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,mix1.r %0 = %r1, %r2";
+  else
+    return "%,mix1.l %0 = %r2, %r1";
+}
   [(set_attr "itanium_class" "mmshf")])
 
 (define_insn "*mux1_rev"
   [(set (match_operand:V8QI 0 "gr_register_operand" "=r")
 	(vec_select:V8QI
 	  (match_operand:V8QI 1 "gr_register_operand" "r")
-	  (parallel [(const_int 7)
-		     (const_int 6)
-		     (const_int 5)
-		     (const_int 4)
-		     (const_int 3)
-		     (const_int 2)
-		     (const_int 1)
-		     (const_int 0)])))]
+	  (parallel [(const_int 7) (const_int 6)
		     (const_int 5) (const_int 4)
+		     (const_int 3) (const_int 2)
+		     (const_int 1) (const_int 0)])))]
   ""
   "mux1 %0 = %1, @rev"
   [(set_attr "itanium_class" "mmshf")])
@@ -782,14 +849,10 @@
   [(set (match_operand:V8QI 0 "gr_register_operand" "=r")
 	(vec_select:V8QI
 	  (match_operand:V8QI 1 "gr_register_operand" "r")
-	  (parallel [(const_int 0)
-		     (const_int 4)
-		     (const_int 2)
-		     (const_int 6)
-		     (const_int 1)
-		     (const_int 5)
-		     (const_int 3)
-		     (const_int 7)])))]
+	  (parallel [(const_int 0) (const_int 4)
+		     (const_int 2) (const_int 6)
+		     (const_int 1) (const_int 5)
+		     (const_int 3) (const_int 7)])))]
   ""
   "mux1 %0 = %1, @mix"
   [(set_attr "itanium_class" "mmshf")])
@@ -798,14 +861,10 @@
   [(set (match_operand:V8QI 0 "gr_register_operand" "=r")
 	(vec_select:V8QI
 	  (match_operand:V8QI 1 "gr_register_operand" "r")
-	  (parallel [(const_int 0)
-		     (const_int 4)
-		     (const_int 1)
-		     (const_int 5)
-		     (const_int 2)
-		     (const_int 6)
-		     (const_int 3)
-		     (const_int 7)])))]
+	  (parallel [(const_int 0) (const_int 4)
+		     (const_int 1) (const_int 5)
+		     (const_int 2) (const_int 6)
+		     (const_int 3) (const_int 7)])))]
   ""
   "mux1 %0 = %1, @shuf"
   [(set_attr "itanium_class" "mmshf")])
@@ -814,14 +873,10 @@
   [(set (match_operand:V8QI 0 "gr_register_operand" "=r")
 	(vec_select:V8QI
 	  (match_operand:V8QI 1 "gr_register_operand" "r")
-	  (parallel [(const_int 0)
-		     (const_int 2)
-		     (const_int 4)
-		     (const_int 6)
-		     (const_int 1)
-		     (const_int 3)
-		     (const_int 5)
-		     (const_int 7)])))]
+	  (parallel [(const_int 0) (const_int 2)
+		     (const_int 4) (const_int 6)
+		     (const_int 1) (const_int 3)
+		     (const_int 5) (const_int 7)])))]
   ""
   "mux1 %0 = %1, @alt"
   [(set_attr "itanium_class" "mmshf")])
@@ -830,14 +885,14 @@
   [(set (match_operand:V8QI 0 "gr_register_operand" "=r")
 	(vec_select:V8QI
 	  (match_operand:V8QI 1 "gr_register_operand" "r")
-	  (parallel [(const_int 0)
-		     (const_int 0)
-		     (const_int 0)
-		     (const_int 0)
-		     (const_int 0)
-		     (const_int 0)
-		     (const_int 0)
-		     (const_int 0)])))]
+	  (parallel [(match_operand 2 "mux1_brcst_element" "")
+		     (match_dup 2)
+		     (match_dup 2)
+		     (match_dup 2)
+		     (match_dup 2)
+		     (match_dup 2)
+		     (match_dup 2)
+		     (match_dup 2)])))]
   ""
   "mux1 %0 = %1, @brcst"
   [(set_attr "itanium_class" "mmshf")])
@@ -857,7 +912,7 @@
   ""
 {
   rtx temp = gen_reg_rtx (V8QImode);
-  emit_insn (gen_mix1_r (temp, operands[1], operands[2]));
+  emit_insn (gen_mix1_even (temp, operands[1], operands[2]));
   emit_insn (gen_mux1_alt (operands[0], temp));
   DONE;
 })
@@ -869,7 +924,7 @@
   ""
 {
   rtx temp = gen_reg_rtx (V8QImode);
-  emit_insn (gen_mix1_l (temp, operands[1], operands[2]));
+  emit_insn (gen_mix1_odd (temp, operands[1], operands[2]));
   emit_insn (gen_mux1_alt (operands[0], temp));
   DONE;
 })
@@ -880,12 +935,16 @@
 	(vec_concat:V8HI
 	  (match_operand:V4HI 1 "gr_reg_or_0_operand" "rU")
 	  (match_operand:V4HI 2 "gr_reg_or_0_operand" "rU"))
-	  (parallel [(const_int 0)
-		     (const_int 4)
-		     (const_int 1)
-		     (const_int 5)])))]
+	  (parallel [(const_int 0) (const_int 4)
+		     (const_int 1) (const_int 5)])))]
   ""
-  "unpack2.l %0 = %r2, %r1"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,unpack2.h %0 = %r1, %r2";
+  else
+    return "%,unpack2.l %0 = %r2, %r1";
+}
   [(set_attr "itanium_class" "mmshf")])
 
 (define_insn "vec_interleave_highv4hi"
@@ -894,40 +953,52 @@
 	(vec_concat:V8HI
 	  (match_operand:V4HI 1 "gr_reg_or_0_operand" "rU")
 	  (match_operand:V4HI 2 "gr_reg_or_0_operand" "rU"))
-	  (parallel [(const_int 2)
-		     (const_int 6)
-		     (const_int 3)
-		     (const_int 7)])))]
+	  (parallel [(const_int 2) (const_int 6)
+		     (const_int 3) (const_int 7)])))]
   ""
-  "unpack2.h %0 = %r2, %r1"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,unpack2.l %0 = %r1, %r2";
+  else
+    return "%,unpack2.h %0 = %r2, %r1";
+}
   [(set_attr "itanium_class" "mmshf")])
 
-(define_insn "mix2_r"
+(define_insn "mix2_even"
   [(set (match_operand:V4HI 0 "gr_register_operand" "=r")
 	(vec_select:V4HI
 	  (vec_concat:V8HI
 	    (match_operand:V4HI 1 "gr_reg_or_0_operand" "rU")
 	    (match_operand:V4HI 2 "gr_reg_or_0_operand" "rU"))
-	  (parallel [(const_int 0)
-		     (const_int 4)
-		     (const_int 2)
-		     (const_int 6)])))]
+	  (parallel [(const_int 0) (const_int 4)
+		     (const_int 2) (const_int 6)])))]
   ""
-  "mix2.r %0 = %r2, %r1"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,mix2.l %0 = %r1, %r2";
+  else
+    return "%,mix2.r %0 = %r2, %r1";
+}
   [(set_attr "itanium_class" "mmshf")])
 
-(define_insn "mix2_l"
+(define_insn "mix2_odd"
   [(set (match_operand:V4HI 0 "gr_register_operand" "=r")
 	(vec_select:V4HI
 	  (vec_concat:V8HI
 	    (match_operand:V4HI 1 "gr_reg_or_0_operand" "rU")
 	    (match_operand:V4HI 2 "gr_reg_or_0_operand" "rU"))
-	  (parallel [(const_int 1)
-		     (const_int 5)
-		     (const_int 3)
-		     (const_int 7)])))]
+	  (parallel [(const_int 1) (const_int 5)
		     (const_int 3) (const_int 7)])))]
   ""
-  "mix2.l %0 = %r2, %r1"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,mix2.r %0 = %r1, %r2";
+  else
+    return "%,mix2.l %0 = %r2, %r1";
+}
   [(set_attr "itanium_class" "mmshf")])
 
 (define_insn "*mux2"
@@ -940,11 +1011,21 @@
 		     (match_operand 5 "const_int_2bit_operand" "")])))]
   ""
 {
-  int mask;
-  mask  = INTVAL (operands[2]);
-  mask |= INTVAL (operands[3]) << 2;
-  mask |= INTVAL (operands[4]) << 4;
-  mask |= INTVAL (operands[5]) << 6;
+  int mask = 0;
+  if (TARGET_BIG_ENDIAN)
+    {
+      mask |= (3 - INTVAL (operands[2])) << 6;
+      mask |= (3 - INTVAL (operands[3])) << 4;
+      mask |= (3 - INTVAL (operands[4])) << 2;
+      mask |= 3 - INTVAL (operands[5]);
+    }
+  else
+    {
+      mask |= INTVAL (operands[2]);
+      mask |= INTVAL (operands[3]) << 2;
+      mask |= INTVAL (operands[4]) << 4;
+      mask |= INTVAL (operands[5]) << 6;
+    }
   operands[2] = GEN_INT (mask);
   return "%,mux2 %0 = %1, %2";
 }
@@ -954,10 +1035,8 @@
   [(set (match_operand:V4HI 0 "gr_register_operand" "")
 	(vec_select:V4HI
 	  (match_operand:V4HI 1 "gr_register_operand" "")
-	  (parallel [(const_int 0)
-		     (const_int 2)
-		     (const_int 1)
-		     (const_int 3)])))]
+	  (parallel [(const_int 0) (const_int 2)
+		     (const_int 1) (const_int 3)])))]
   "")
 
 (define_expand "vec_extract_evenv4hi"
@@ -967,7 +1046,7 @@
   ""
 {
   rtx temp = gen_reg_rtx (V4HImode);
-  emit_insn (gen_mix2_r (temp, operands[1], operands[2]));
+  emit_insn (gen_mix2_even (temp, operands[1], operands[2]));
   emit_insn (gen_vec_extract_evenodd_helper (operands[0], temp));
   DONE;
 })
@@ -979,7 +1058,7 @@
   ""
 {
   rtx temp = gen_reg_rtx (V4HImode);
-  emit_insn (gen_mix2_l (temp, operands[1], operands[2]));
+  emit_insn (gen_mix2_odd (temp, operands[1], operands[2]));
   emit_insn (gen_vec_extract_evenodd_helper (operands[0], temp));
   DONE;
 })
@@ -992,30 +1071,38 @@
   "mux2 %0 = %1, 0"
   [(set_attr "itanium_class" "mmshf")])
 
-;; Note that mix4.r performs the exact same operation.
 (define_insn "vec_interleave_lowv2si"
   [(set (match_operand:V2SI 0 "gr_register_operand" "=r")
 	(vec_select:V2SI
 	  (vec_concat:V4SI
 	    (match_operand:V2SI 1 "gr_reg_or_0_operand" "rU")
 	    (match_operand:V2SI 2 "gr_reg_or_0_operand" "rU"))
-	  (parallel [(const_int 0)
-		     (const_int 2)])))]
+	  (parallel [(const_int 0) (const_int 2)])))]
   ""
-  "unpack4.l %0 = %r2, %r1"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,unpack4.h %0 = %r1, %r2";
+  else
+    return "%,unpack4.l %0 = %r2, %r1";
+}
   [(set_attr "itanium_class" "mmshf")])
 
-;; Note that mix4.l performs the exact same operation.
 (define_insn "vec_interleave_highv2si"
   [(set (match_operand:V2SI 0 "gr_register_operand" "=r")
 	(vec_select:V2SI
 	  (vec_concat:V4SI
 	    (match_operand:V2SI 1 "gr_reg_or_0_operand" "rU")
 	    (match_operand:V2SI 2 "gr_reg_or_0_operand" "rU"))
-	  (parallel [(const_int 1)
-		     (const_int 3)])))]
+	  (parallel [(const_int 1) (const_int 3)])))]
   ""
-  "unpack4.h %0 = %r2, %r1"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,unpack4.l %0 = %r1, %r2";
+  else
+    return "%,unpack4.h %0 = %r2, %r1";
+}
   [(set_attr "itanium_class" "mmshf")])
 
 (define_expand "vec_extract_evenv2si"
@@ -1061,10 +1148,7 @@
   if (!gr_reg_or_0_operand (op2, SImode))
     op2 = force_reg (SImode, op2);
 
-  if (TARGET_BIG_ENDIAN)
-    x = gen_rtx_VEC_CONCAT (V2SImode, op2, op1);
-  else
-    x = gen_rtx_VEC_CONCAT (V2SImode, op1, op2);
+  x = gen_rtx_VEC_CONCAT (V2SImode, op1, op2);
   emit_insn (gen_rtx_SET (VOIDmode, operands[0], x));
   DONE;
 })
@@ -1075,7 +1159,13 @@
 	(vec_concat:V2SI
 	  (match_operand:SI 1 "gr_reg_or_0_operand" "rO")
 	  (match_operand:SI 2 "gr_reg_or_0_operand" "rO")))]
   ""
-  "unpack4.l %0 = %r2, %r1"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,unpack4.l %0 = %r1, %r2";
+  else
+    return "%,unpack4.l %0 = %r2, %r1";
+}
   [(set_attr "itanium_class" "mmshf")])
 
 ;; Missing operations
@@ -1333,10 +1423,7 @@
   if (!fr_reg_or_fp01_operand (op2, SFmode))
     op2 = force_reg (SFmode, op2);
 
-  if (TARGET_BIG_ENDIAN)
-    emit_insn (gen_fpack (operands[0], op2, op1));
-  else
-    emit_insn (gen_fpack (operands[0], op1, op2));
+  emit_insn (gen_fpack (operands[0], op1, op2));
   DONE;
 })
 
@@ -1346,7 +1433,13 @@
 	(vec_concat:V2SF
 	  (match_operand:SF 1 "fr_reg_or_fp01_operand" "fG")
 	  (match_operand:SF 2 "fr_reg_or_fp01_operand" "fG")))]
   ""
-  "fpack %0 = %F2, %F1"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,fpack %0 = %F1, %F2";
+  else
+    return "%,fpack %0 = %F2, %F1";
+}
   [(set_attr "itanium_class" "fmisc")])
 
 (define_insn "fswap"
@@ -1357,7 +1450,13 @@
 	    (match_operand:V2SF 2 "fr_reg_or_0_operand" "fU"))
 	  (parallel [(const_int 1) (const_int 2)])))]
   ""
-  "fswap %0 = %F1, %F2"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,fswap %0 = %F2, %F1";
+  else
+    return "%,fswap %0 = %F1, %F2";
+}
   [(set_attr "itanium_class" "fmisc")])
 
 (define_insn "vec_interleave_highv2sf"
@@ -1368,7 +1467,13 @@
 	    (match_operand:V2SF 2 "fr_reg_or_0_operand" "fU"))
 	  (parallel [(const_int 1) (const_int 3)])))]
   ""
-  "fmix.l %0 = %F2, %F1"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,fmix.r %0 = %F1, %F2";
+  else
+    return "%,fmix.l %0 = %F2, %F1";
+}
   [(set_attr "itanium_class" "fmisc")])
 
 (define_insn "vec_interleave_lowv2sf"
@@ -1379,7 +1484,13 @@
 	    (match_operand:V2SF 2 "fr_reg_or_0_operand" "fU"))
 	  (parallel [(const_int 0) (const_int 2)])))]
   ""
-  "fmix.r %0 = %F2, %F1"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,fmix.l %0 = %F1, %F2";
+  else
+    return "%,fmix.r %0 = %F2, %F1";
+}
   [(set_attr "itanium_class" "fmisc")])
 
 (define_insn "fmix_lr"
@@ -1390,7 +1501,13 @@
 	    (match_operand:V2SF 2 "fr_reg_or_0_operand" "fU"))
 	  (parallel [(const_int 0) (const_int 3)])))]
   ""
-  "fmix.lr %0 = %F2, %F1"
+{
+  /* Recall that vector elements are numbered in memory order.  */
+  if (TARGET_BIG_ENDIAN)
+    return "%,fmix.lr %0 = %F1, %F2";
+  else
+    return "%,fmix.lr %0 = %F2, %F1";
+}
   [(set_attr "itanium_class" "fmisc")])
 
 (define_expand "vec_extract_evenv2sf"
@@ -1415,23 +1532,24 @@
   DONE;
 })
 
-
 (define_expand "vec_setv2sf"
-  [(match_operand:V2SF 0 "fr_register_operand" "")
-   (match_operand:SF 1 "fr_register_operand" "")
+  [(match_operand:V2SF 0 "register_operand" "")
+   (match_operand:SF 1 "register_operand" "")
    (match_operand 2 "const_int_operand" "")]
   ""
 {
+  rtx op0 = operands[0];
   rtx tmp = gen_reg_rtx (V2SFmode);
+
   emit_insn (gen_fpack (tmp, operands[1], CONST0_RTX (SFmode)));
 
   switch (INTVAL (operands[2]))
     {
    case 0:
-      emit_insn (gen_fmix_lr (operands[0], tmp, operands[0]));
+      emit_insn (gen_fmix_lr (op0, tmp, op0));
      break;
    case 1:
-      emit_insn (gen_vec_interleave_lowv2sf (operands[0], operands[0], tmp));
+      emit_insn (gen_vec_interleave_lowv2sf (op0, op0, tmp));
      break;
    default:
      gcc_unreachable ();
@@ -1458,8 +1576,8 @@
 })
 
 (define_insn_and_split "*vec_extractv2sf_0_be"
-  [(set (match_operand:SF 0 "register_operand" "=r,f")
-	(unspec:SF [(match_operand:V2SF 1 "register_operand" "rf,r")
+  [(set (match_operand:SF 0 "register_operand" "=rf,r")
+	(unspec:SF [(match_operand:V2SF 1 "nonimmediate_operand" "m,r")
		    (const_int 0)]
		   UNSPEC_VECT_EXTR))]
   "TARGET_BIG_ENDIAN"
@@ -1467,31 +1585,44 @@
   "reload_completed"
   [(set (match_dup 0) (match_dup 1))]
 {
-  if (REG_P (operands[1]) && FR_REGNO_P (REGNO (operands[1])))
-    operands[0] = gen_rtx_REG (V2SFmode, REGNO (operands[0]));
+  if (MEM_P (operands[1]))
+    operands[1] = adjust_address (operands[1], SFmode, 0);
   else
-    operands[1] = gen_rtx_REG (SFmode, REGNO (operands[1]));
+    {
+      emit_insn (gen_lshrdi3 (operands[0], operands[1], GEN_INT (32)));
+      DONE;
+    }
 })
 
-(define_insn_and_split "*vec_extractv2sf_1"
+(define_insn_and_split "*vec_extractv2sf_1_le"
   [(set (match_operand:SF 0 "register_operand" "=r")
	(unspec:SF [(match_operand:V2SF 1 "register_operand" "r")
		    (const_int 1)]
		   UNSPEC_VECT_EXTR))]
-  ""
+  "!TARGET_BIG_ENDIAN"
   "#"
-  "reload_completed"
+  "&& reload_completed"
   [(const_int 0)]
 {
   operands[0] = gen_rtx_REG (DImode, REGNO (operands[0]));
   operands[1] = gen_rtx_REG (DImode, REGNO (operands[1]));
-  if (TARGET_BIG_ENDIAN)
-    emit_move_insn (operands[0], operands[1]);
-  else
-    emit_insn (gen_lshrdi3 (operands[0], operands[1], GEN_INT (32)));
+  emit_insn (gen_lshrdi3 (operands[0], operands[1], GEN_INT (32)));
   DONE;
 })
 
+(define_insn_and_split "*vec_extractv2sf_1_be"
+  [(set (match_operand:SF 0 "register_operand" "=rf")
+	(unspec:SF [(match_operand:V2SF 1 "register_operand" "r")
+		    (const_int 1)]
+		   UNSPEC_VECT_EXTR))]
+  "TARGET_BIG_ENDIAN"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (match_dup 1))]
+{
+  operands[1] = gen_rtx_REG (SFmode, REGNO (operands[1]));
+})
+
 (define_expand "vec_extractv2sf"
   [(set (match_operand:SF 0 "register_operand" "")
	(unspec:SF [(match_operand:V2SF 1 "register_operand" "")
@@ -1542,9 +1673,12 @@
    (match_operand:V4HI 2 "gr_register_operand" "")]
   ""
 {
-  rtx op1 = gen_lowpart(V8QImode, operands[1]);
-  rtx op2 = gen_lowpart(V8QImode, operands[2]);
-  emit_insn (gen_vec_extract_evenv8qi (operands[0], op1, op2));
+  rtx op1 = gen_lowpart (V8QImode, operands[1]);
+  rtx op2 = gen_lowpart (V8QImode, operands[2]);
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_vec_extract_oddv8qi (operands[0], op1, op2));
+  else
+    emit_insn (gen_vec_extract_evenv8qi (operands[0], op1, op2));
   DONE;
 })
 
@@ -1554,9 +1688,12 @@
    (match_operand:V2SI 2 "gr_register_operand" "")]
   ""
 {
-  rtx op1 = gen_lowpart(V4HImode, operands[1]);
-  rtx op2 = gen_lowpart(V4HImode, operands[2]);
-  emit_insn (gen_vec_extract_evenv4hi (operands[0], op1, op2));
+  rtx op1 = gen_lowpart (V4HImode, operands[1]);
+  rtx op2 = gen_lowpart (V4HImode, operands[2]);
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_vec_extract_oddv4hi (operands[0], op1, op2));
+  else
+    emit_insn (gen_vec_extract_evenv4hi (operands[0], op1, op2));
   DONE;
 })
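Aside, not part of the patch: the *mux2 change above is the one place
where the big-endian correction is arithmetic rather than an opcode
swap: each 2-bit lane selector is complemented and the four selectors
are packed in the opposite order.  A throwaway C model (nothing here is
in the tree):

#include <assert.h>

/* Model of the *mux2 mask: sel[i] is the memory-order source lane for
   destination lane i.  Little-endian packs sel[0] into the low two
   bits; big-endian reverses the lane order and complements each
   selector, matching the new TARGET_BIG_ENDIAN branch above.  */
static int
mux2_mask (const int sel[4], int big_endian_p)
{
  int mask = 0, i;
  for (i = 0; i < 4; i++)
    {
      int s = big_endian_p ? 3 - sel[i] : sel[i];
      int shift = big_endian_p ? 2 * (3 - i) : 2 * i;
      mask |= s << shift;
    }
  return mask;
}

int
main (void)
{
  /* The identity shuffle encodes as 0xe4 under either convention,
     which is a handy sanity check.  */
  static const int identity[4] = { 0, 1, 2, 3 };
  assert (mux2_mask (identity, 0) == 0xe4);
  assert (mux2_mask (identity, 1) == 0xe4);
  return 0;
}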
Index: gcc/config/ia64/ia64-protos.h
===================================================================
--- gcc/config/ia64/ia64-protos.h	(revision 168549)
+++ gcc/config/ia64/ia64-protos.h	(working copy)
@@ -39,6 +39,7 @@
 extern void ia64_expand_compare (rtx *, rtx *, rtx *);
 extern void ia64_expand_vecint_cmov (rtx[]);
 extern bool ia64_expand_vecint_minmax (enum rtx_code, enum machine_mode, rtx[]);
+extern void ia64_unpack_assemble (rtx, rtx, rtx, bool);
 extern void ia64_expand_unpack (rtx [], bool, bool);
 extern void ia64_expand_widen_sum (rtx[], bool);
 extern void ia64_expand_widen_mul_v4hi (rtx [], bool, bool);
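One last aside, again illustrative C rather than anything in the tree:
the rewritten mulv8qi3 is correct because the low byte of an 8x8
widening product does not depend on signedness, so two unsigned widening
multiplies plus vec_pack_trunc compute a full V8QI multiply.

#include <stdint.h>
#include <assert.h>

/* Scalar model of the new mulv8qi3: widen a QI pair to HI, multiply,
   then truncate back to QI.  The truncation keeps only the low byte,
   which is identical for signed and unsigned views of the inputs.  */
static uint8_t
mulqi (uint8_t a, uint8_t b)
{
  uint16_t wide = (uint16_t) a * (uint16_t) b;  /* vec_widen_umult_*   */
  return (uint8_t) wide;                        /* vec_pack_trunc_v4hi */
}

int
main (void)
{
  assert (mulqi (200, 3) == 88);                      /* 600 & 0xff      */
  assert (mulqi ((uint8_t) -3, 5) == (uint8_t) -15);  /* signed view too */
  return 0;
}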