Message ID | 20141017122800.GA49545@msticlxl57.ims.intel.com |
---|---|
State | New |
Headers | show |
On Fri, Oct 17, 2014 at 04:28:12PM +0400, Kirill Yukhin wrote: > > I wonder whether for these modes it can ever be beneficial to build them > > through interleaves/concatenations etc., if it wouldn't be better to build > > them by storing all values into memory and just reading it back. > I've tried this example: > #include <immintrin.h> > > unsigned char a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, > a15, a16, a17, a18, a19, a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, > a30, a31, a32, a33, a34, a35, a36, a37, a38, a39, a40, a41, a42, a43, a44, > a45, a46, a47, a48, a49, a50, a51, a52, a53, a54, a55, a56, a57, a58, a59, > a60, a61, a62, a63; > > __m512i foo () > { > return __extension__ (__m512i)(__v64qi){ > a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, > a15, a16, a17, a18, a19, a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, > a30, a31, a32, a33, a34, a35, a36, a37, a38, a39, a40, a41, a42, a43, a44, > a45, a46, a47, a48, a49, a50, a51, a52, a53, a54, a55, a56, a57, a58, a59, > a60, a61, a62, a63 }; > } > > w/ and w/o -mavx512bw (and always -mavx512f). > > When, this code works, we've got 127 lines of assembly to do this init. > W/o AVX-512BW we've got > 300 lines of code (mostly on GPRs, using sal, and etc.) > > Then I've looked into actual assembly w/ -mavx512bw and it turns out that no > AVX-512BW insn were generated, only AVX-512F (and below). Fixed iterator. Ok, if it is shorter than copying all those into memory and reading from memory, so be it. > > > -(define_mode_iterator VI48F_512 [V16SI V16SF V8DI V8DF]) > > > +(define_mode_iterator VI48F_I12_AVX512BW > > > + [V16SI V16SF V8DI V8DF > > > + (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512BW")]) > > > > What does the I12 stand for? Wasn't it meant to be VI48F_512_AVX512BW > > or I512? > Actually, I am not awere of any name convention for iterators. > As far as I understand, name [more or less] for vector mode > should reflect: > - Type family of the unit: float or int > - Size of the unit: 1, 2, 4 etc. bytes > - If possible, target predicates to enable certain modes in > given iterator. > > The name is: > - Vector (V) > - I48F - contains both ints and floats of size 4 and 8 > - I12 - contains ints of size 1 and 2 > - AVX512BW - affected by the target (according to previous note - to be removed) > > Maybe it'll be better to name it: VF48_I1248? I'll leave that to Uros, the patch is ok by me. Jakub
On Fri, Oct 17, 2014 at 2:57 PM, Jakub Jelinek <jakub@redhat.com> wrote: > On Fri, Oct 17, 2014 at 04:28:12PM +0400, Kirill Yukhin wrote: >> > I wonder whether for these modes it can ever be beneficial to build them >> > through interleaves/concatenations etc., if it wouldn't be better to build >> > them by storing all values into memory and just reading it back. >> I've tried this example: >> #include <immintrin.h> >> >> unsigned char a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, >> a15, a16, a17, a18, a19, a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, >> a30, a31, a32, a33, a34, a35, a36, a37, a38, a39, a40, a41, a42, a43, a44, >> a45, a46, a47, a48, a49, a50, a51, a52, a53, a54, a55, a56, a57, a58, a59, >> a60, a61, a62, a63; >> >> __m512i foo () >> { >> return __extension__ (__m512i)(__v64qi){ >> a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, >> a15, a16, a17, a18, a19, a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, >> a30, a31, a32, a33, a34, a35, a36, a37, a38, a39, a40, a41, a42, a43, a44, >> a45, a46, a47, a48, a49, a50, a51, a52, a53, a54, a55, a56, a57, a58, a59, >> a60, a61, a62, a63 }; >> } >> >> w/ and w/o -mavx512bw (and always -mavx512f). >> >> When, this code works, we've got 127 lines of assembly to do this init. >> W/o AVX-512BW we've got > 300 lines of code (mostly on GPRs, using sal, and etc.) >> >> Then I've looked into actual assembly w/ -mavx512bw and it turns out that no >> AVX-512BW insn were generated, only AVX-512F (and below). Fixed iterator. > > Ok, if it is shorter than copying all those into memory and reading from > memory, so be it. > >> > > -(define_mode_iterator VI48F_512 [V16SI V16SF V8DI V8DF]) >> > > +(define_mode_iterator VI48F_I12_AVX512BW >> > > + [V16SI V16SF V8DI V8DF >> > > + (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512BW")]) >> > >> > What does the I12 stand for? Wasn't it meant to be VI48F_512_AVX512BW >> > or I512? >> Actually, I am not awere of any name convention for iterators. >> As far as I understand, name [more or less] for vector mode >> should reflect: >> - Type family of the unit: float or int >> - Size of the unit: 1, 2, 4 etc. bytes >> - If possible, target predicates to enable certain modes in >> given iterator. >> >> The name is: >> - Vector (V) >> - I48F - contains both ints and floats of size 4 and 8 >> - I12 - contains ints of size 1 and 2 >> - AVX512BW - affected by the target (according to previous note - to be removed) >> >> Maybe it'll be better to name it: VF48_I1248? > > I'll leave that to Uros, the patch is ok by me. Don't want to bikeshed, but VF48_I1248 looks somehow better to me. Anyway, the patch is OK even without this change. Thanks, Uros.
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index baf0d3d..c3202c4 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -39760,6 +39760,8 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode, case V8SFmode: case V8SImode: case V2DFmode: + case V64QImode: + case V32HImode: case V2DImode: case V4SFmode: case V4SImode: @@ -39790,6 +39792,9 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode, goto widen; case V8HImode: + if (TARGET_AVX512VL && TARGET_AVX512BW) + return ix86_vector_duplicate_value (mode, target, val); + if (TARGET_SSE2) { struct expand_vec_perm_d dperm; @@ -39820,6 +39825,9 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode, goto widen; case V16QImode: + if (TARGET_AVX512VL && TARGET_AVX512BW) + return ix86_vector_duplicate_value (mode, target, val); + if (TARGET_SSE2) goto permute; goto widen; @@ -39849,16 +39857,19 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode, case V16HImode: case V32QImode: - { - enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode); - rtx x = gen_reg_rtx (hvmode); + if (TARGET_AVX512VL && TARGET_AVX512BW) + return ix86_vector_duplicate_value (mode, target, val); + else + { + enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode); + rtx x = gen_reg_rtx (hvmode); - ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val); - gcc_assert (ok); + ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val); + gcc_assert (ok); - x = gen_rtx_VEC_CONCAT (mode, x, x); - emit_insn (gen_rtx_SET (VOIDmode, target, x)); - } + x = gen_rtx_VEC_CONCAT (mode, x, x); + emit_insn (gen_rtx_SET (VOIDmode, target, x)); + } return true; default: @@ -40420,8 +40431,9 @@ static void ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode, rtx target, rtx vals) { - rtx ops[64], op0, op1; + rtx ops[64], op0, op1, op2, op3, op4, op5; enum machine_mode half_mode = VOIDmode; + enum machine_mode quarter_mode = VOIDmode; int n, i; switch (mode) @@ -40472,6 +40484,42 @@ half: gen_rtx_VEC_CONCAT (mode, op0, op1))); return; + case V64QImode: + quarter_mode = V16QImode; + half_mode = V32QImode; + goto quarter; + + case V32HImode: + quarter_mode = V8HImode; + half_mode = V16HImode; + goto quarter; + +quarter: + n = GET_MODE_NUNITS (mode); + for (i = 0; i < n; i++) + ops[i] = XVECEXP (vals, 0, i); + op0 = gen_reg_rtx (quarter_mode); + op1 = gen_reg_rtx (quarter_mode); + op2 = gen_reg_rtx (quarter_mode); + op3 = gen_reg_rtx (quarter_mode); + op4 = gen_reg_rtx (half_mode); + op5 = gen_reg_rtx (half_mode); + ix86_expand_vector_init_interleave (quarter_mode, op0, ops, + n >> 3); + ix86_expand_vector_init_interleave (quarter_mode, op1, + &ops [n >> 2], n >> 3); + ix86_expand_vector_init_interleave (quarter_mode, op2, + &ops [n >> 1], n >> 3); + ix86_expand_vector_init_interleave (quarter_mode, op3, + &ops [(n >> 1) | (n >> 2)], n >> 3); + emit_insn (gen_rtx_SET (VOIDmode, op4, + gen_rtx_VEC_CONCAT (half_mode, op0, op1))); + emit_insn (gen_rtx_SET (VOIDmode, op5, + gen_rtx_VEC_CONCAT (half_mode, op2, op3))); + emit_insn (gen_rtx_SET (VOIDmode, target, + gen_rtx_VEC_CONCAT (mode, op4, op5))); + return; + case V16QImode: if (!TARGET_SSE4_1) break; diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index dcb53df..4dfdb69 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -524,7 +524,8 @@ (V16SI "TARGET_AVX512F") (V16SF "TARGET_AVX512F") (V8DI "TARGET_AVX512F") (V8DF "TARGET_AVX512F") (V4DI "TARGET_AVX512VL") (V4DF "TARGET_AVX512VL")]) -(define_mode_iterator VI48F_512 [V16SI V16SF V8DI V8DF]) +(define_mode_iterator VI48F_I12 + [V16SI V16SF V8DI V8DF V32HI V64QI]) (define_mode_iterator VI48F [V16SI V16SF V8DI V8DF (V8SI "TARGET_AVX512VL") (V8SF "TARGET_AVX512VL") @@ -17475,7 +17476,7 @@ }) (define_expand "vec_init<mode>" - [(match_operand:VI48F_512 0 "register_operand") + [(match_operand:VI48F_I12 0 "register_operand") (match_operand 1)] "TARGET_AVX512F" {