@@ -5161,6 +5161,18 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
else
unpack = gen_sse4_1_sign_extendv2siv2di2;
break;
+ case E_V8QImode:
+ if (unsigned_p)
+ unpack = gen_sse4_1_zero_extendv4qiv4hi2;
+ else
+ unpack = gen_sse4_1_sign_extendv4qiv4hi2;
+ break;
+ case E_V4HImode:
+ if (unsigned_p)
+ unpack = gen_sse4_1_zero_extendv2hiv2si2;
+ else
+ unpack = gen_sse4_1_sign_extendv2hiv2si2;
+ break;
default:
gcc_unreachable ();
}
@@ -5172,10 +5184,24 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
}
else if (high_p)
{
- /* Shift higher 8 bytes to lower 8 bytes. */
- tmp = gen_reg_rtx (V1TImode);
- emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
- GEN_INT (64)));
+ switch (GET_MODE_SIZE (imode))
+ {
+ case 16:
+ /* Shift higher 8 bytes to lower 8 bytes. */
+ tmp = gen_reg_rtx (V1TImode);
+ emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
+ GEN_INT (64)));
+ break;
+ case 8:
+ /* Shift higher 4 bytes to lower 4 bytes. */
+ tmp = gen_reg_rtx (V1DImode);
+ emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src),
+ GEN_INT (32)));
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
tmp = gen_lowpart (imode, tmp);
}
else
@@ -5207,6 +5233,18 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
else
unpack = gen_vec_interleave_lowv4si;
break;
+ case E_V8QImode:
+ if (high_p)
+ unpack = gen_mmx_punpckhbw;
+ else
+ unpack = gen_mmx_punpcklbw;
+ break;
+ case E_V4HImode:
+ if (high_p)
+ unpack = gen_mmx_punpckhwd;
+ else
+ unpack = gen_mmx_punpcklwd;
+ break;
default:
gcc_unreachable ();
}
@@ -1000,6 +1000,9 @@ (define_code_iterator any_truncate [ss_truncate truncate us_truncate])
(define_code_attr trunsuffix
[(ss_truncate "s") (truncate "") (us_truncate "us")])
+;; Instruction suffix for SSE sign and zero extensions.
+(define_code_attr extsuffix [(sign_extend "sx") (zero_extend "zx")])
+
;; Used in signed and unsigned fix.
(define_code_iterator any_fix [fix unsigned_fix])
(define_code_attr fixsuffix [(fix "") (unsigned_fix "u")])
@@ -2639,6 +2639,78 @@ (define_insn_and_split "mmx_punpckldq"
(set_attr "type" "mmxcvt,sselog,sselog")
(set_attr "mode" "DI,TI,TI")])
+(define_insn "sse4_1_<code>v4qiv4hi2"
+ [(set (match_operand:V4HI 0 "register_operand" "=Yr,*x,Yw")
+ (any_extend:V4HI
+ (vec_select:V4QI
+ (match_operand:V8QI 1 "register_operand" "Yr,*x,Yw")
+ (parallel [(const_int 0) (const_int 1)
+ (const_int 2) (const_int 3)]))))]
+ "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+ "%vpmov<extsuffix>bw\t{%1, %0|%0, %1}"
+ [(set_attr "isa" "noavx,noavx,avx")
+ (set_attr "type" "ssemov")
+ (set_attr "prefix_extra" "1")
+ (set_attr "prefix" "orig,orig,maybe_evex")
+ (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_<code>v2hiv2si2"
+ [(set (match_operand:V2SI 0 "register_operand" "=Yr,*x,v")
+ (any_extend:V2SI
+ (vec_select:V2HI
+ (match_operand:V4HI 1 "register_operand" "Yr,*x,v")
+ (parallel [(const_int 0) (const_int 1)]))))]
+ "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+ "%vpmov<extsuffix>wd\t{%1, %0|%0, %1}"
+ [(set_attr "isa" "noavx,noavx,avx")
+ (set_attr "type" "ssemov")
+ (set_attr "prefix_extra" "1")
+ (set_attr "prefix" "orig,orig,maybe_evex")
+ (set_attr "mode" "TI")])
+
+;; Pack/unpack vector modes
+(define_mode_attr mmxpackmode
+ [(V4HI "V8QI") (V2SI "V4HI")])
+
+(define_expand "vec_pack_trunc_<mode>"
+ [(match_operand:<mmxpackmode> 0 "register_operand")
+ (match_operand:MMXMODE24 1 "register_operand")
+ (match_operand:MMXMODE24 2 "register_operand")]
+ "TARGET_MMX_WITH_SSE"
+{
+ rtx op1 = gen_lowpart (<mmxpackmode>mode, operands[1]);
+ rtx op2 = gen_lowpart (<mmxpackmode>mode, operands[2]);
+ ix86_expand_vec_extract_even_odd (operands[0], op1, op2, 0);
+ DONE;
+})
+
+(define_mode_attr mmxunpackmode
+ [(V8QI "V4HI") (V4HI "V2SI")])
+
+(define_expand "vec_unpacks_lo_<mode>"
+ [(match_operand:<mmxunpackmode> 0 "register_operand")
+ (match_operand:MMXMODE12 1 "register_operand")]
+ "TARGET_MMX_WITH_SSE"
+ "ix86_expand_sse_unpack (operands[0], operands[1], false, false); DONE;")
+
+(define_expand "vec_unpacks_hi_<mode>"
+ [(match_operand:<mmxunpackmode> 0 "register_operand")
+ (match_operand:MMXMODE12 1 "register_operand")]
+ "TARGET_MMX_WITH_SSE"
+ "ix86_expand_sse_unpack (operands[0], operands[1], false, true); DONE;")
+
+(define_expand "vec_unpacku_lo_<mode>"
+ [(match_operand:<mmxunpackmode> 0 "register_operand")
+ (match_operand:MMXMODE12 1 "register_operand")]
+ "TARGET_MMX_WITH_SSE"
+ "ix86_expand_sse_unpack (operands[0], operands[1], true, false); DONE;")
+
+(define_expand "vec_unpacku_hi_<mode>"
+ [(match_operand:<mmxunpackmode> 0 "register_operand")
+ (match_operand:MMXMODE12 1 "register_operand")]
+ "TARGET_MMX_WITH_SSE"
+ "ix86_expand_sse_unpack (operands[0], operands[1], true, true); DONE;")
+
(define_insn "*mmx_pinsrd"
[(set (match_operand:V2SI 0 "register_operand" "=x,Yv")
(vec_merge:V2SI
@@ -976,9 +976,6 @@ (define_mode_attr castmode
[(V8SI "si") (V8SF "ps") (V4DF "pd")
(V16SI "si") (V16SF "ps") (V8DF "pd")])
-;; Instruction suffix for sign and zero extensions.
-(define_code_attr extsuffix [(sign_extend "sx") (zero_extend "zx")])
-
;; i128 for integer vectors and TARGET_AVX2, f128 otherwise.
;; i64x4 or f64x4 for 512bit modes.
(define_mode_attr i128
@@ -8,23 +8,26 @@
void
foo (unsigned char* p1, unsigned char* p2, short* __restrict p3)
{
- for (int i = 0 ; i != 8; i++)
- p3[i] = p1[i] + p2[i];
- return;
+ /* Unroll by the full trip count (8) to avoid loop vectorization. */
+#pragma GCC unroll 8
+ for (int i = 0 ; i != 8; i++)
+ p3[i] = p1[i] + p2[i];
}
void
foo1 (unsigned short* p1, unsigned short* p2, int* __restrict p3)
{
- for (int i = 0 ; i != 4; i++)
- p3[i] = p1[i] + p2[i];
- return;
+ /* Unroll by the full trip count (4) to avoid loop vectorization. */
+#pragma GCC unroll 4
+ for (int i = 0 ; i != 4; i++)
+ p3[i] = p1[i] + p2[i];
}
void
foo2 (unsigned int* p1, unsigned int* p2, long long* __restrict p3)
{
- for (int i = 0 ; i != 2; i++)
- p3[i] = (long long)p1[i] + (long long)p2[i];
- return;
+ /* Unroll by the full trip count (2) to avoid loop vectorization. */
+#pragma GCC unroll 2
+ for (int i = 0 ; i != 2; i++)
+ p3[i] = (long long)p1[i] + (long long)p2[i];
}